jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Extract channel meta fields (closes #9676, closes #12939)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
c5e8d7af 29 clean_html,
9b9c5355 30 error_to_compat_str,
c5e8d7af 31 ExtractorError,
2d30521a 32 float_or_none,
4bb4a188
PH
33 get_element_by_attribute,
34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
4bb4a188 37 orderedSet,
6310acf5 38 parse_codecs,
7c80519c 39 parse_duration,
54fc90aa 40 qualities,
0cb58b02 41 remove_quotes,
3995d37d 42 remove_start,
cf7e015f 43 smuggle_url,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
81c2f20b 49 uppercase_escape,
6e6bc8da 50 urlencode_postdata,
c5e8d7af
PH
51)
52
5f6a1245 53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com requesting hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False, as the docstring promises
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden login form fields plus the JSON-encoded f.req
            # payload; the response has everything before the first '['
            # stripped before being parsed as JSON.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the
        # challenge payloads, so it is spelled out only once.
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return an explicit False, as the docstring promises
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesized so the prefix is applied to either message;
            # '%' binds tighter than the conditional expression.
            warn(
                'Unable to login: %s' % (
                    'Invalid password'
                    if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password message above
                    warn(
                        'Unable to finish TFA: %s' % (
                            'Invalid TFA code'
                            if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true forced into the query."""
        # Work on a copy so the caller's own query dict is never mutated;
        # this also tolerates an explicit query=None.
        query = dict(kwargs.get('query') or {})
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless no downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 272
8377574c 273
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from `page`, then from each further "Load more" page.

        Each chunk of HTML is handed to self._process_page(). The next chunk
        is located through the data-uix-load-more-href attribute of the most
        recent "Load more" widget and downloaded as JSON.
        """
        widget_html = page
        chunk_html = page
        page_num = 0
        while True:
            page_num += 1
            for item in self._process_page(chunk_html):
                yield item

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            response = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            chunk_html = response['content_html']
            if not chunk_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = response['load_more_widget_html']
296
061a75ed
S
297
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in `content`."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Collect unique video ids and their titles from `page`.

        Every match of self._VIDEO_RE contributes its 'id' and 'title'
        groups. Ids keep the order of their first appearance; when an id
        shows up again, its title is only filled in if the stored title is
        still empty. Returns zip(ids, titles).
        """
        ids_in_page = []
        titles_in_page = []
        # id -> position in the two lists above; avoids a linear
        # list.index() scan for every match.
        position_of = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not the 'index'
            # group, with '0' - possibly unintended; behavior kept as is.
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_of.get(video_id)
            if idx is None:
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)
322
323
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist id linked in `content`."""
        linked_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(linked_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download `url` and return a playlist result built from its entries and og title."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
337
338
360e1ca5 339class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 340 IE_DESC = 'YouTube.com'
cb7dfeea 341 _VALID_URL = r"""(?x)^
c5e8d7af 342 (
edb53e2d 343 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 344 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 345 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 346 (?:www\.)?pwnyoutube\.com/|
8b561bfc 347 (?:www\.)?hooktube\.com/|
f7000f3a 348 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
349 tube\.majestyc\.net/|
350 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
351 (?:.*?\#/)? # handle anchor (#/) redirect urls
352 (?: # the various things that can precede the ID:
ac7553d0 353 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 354 |(?: # or the v= param in all its forms
f7000f3a 355 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 356 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 357 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
358 v=
359 )
f4b05232 360 ))
cbaed4bb
S
361 |(?:
362 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
363 vid\.plus| # or vid.plus/xxxx
364 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 365 )/
edb53e2d 366 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 367 )
c5e8d7af 368 )? # all until now is optional -> you can pass the naked ID
8963d9c2 369 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
370 (?!.*?\blist=
371 (?:
372 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
373 WL # WL are handled by the watch later IE
374 )
375 )
c5e8d7af 376 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 377 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 378 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26 379 _formats = {
c2d3cb4c 380 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
381 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
382 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
383 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
384 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
385 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
386 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 388 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 389 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
390 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
391 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
392 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
393 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
394 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 395 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 396 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 398
399
400 # 3D videos
c2d3cb4c 401 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
402 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
403 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
404 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 405 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
406 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
407 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 408
96fb5605 409 # Apple HTTP Live Streaming
11f12195 410 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 411 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
412 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
413 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
414 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
415 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 416 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
417 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
418
419 # DASH mp4 video
d23028a8
S
420 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
421 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
422 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
426 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
430 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
431 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 432
f6f1fc92 433 # Dash mp4 audio
d23028a8
S
434 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
435 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
436 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
437 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
438 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
439 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
440 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
441
442 # Dash webm
d23028a8
S
443 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
444 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
445 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
450 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
451 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
452 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 458 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
459 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
461 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
462 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
465
466 # Dash webm audio
d23028a8
S
467 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
468 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 469
0857baad 470 # Dash webm audio with opus inside
d23028a8
S
471 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
472 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
473 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 474
ce6b9a2d
PH
475 # RTMP (unnamed)
476 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 477 }
23d17e4b 478 _SUBTITLE_FORMATS = ('ttml', 'vtt')
836a086c 479
fd5c4aab
S
480 _GEO_BYPASS = False
481
78caa52a 482 IE_NAME = 'youtube'
2eb88d95
PH
483 _TESTS = [
484 {
2d3d2997 485 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
486 'info_dict': {
487 'id': 'BaW_jenozKc',
488 'ext': 'mp4',
489 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
490 'uploader': 'Philipp Hagemeister',
491 'uploader_id': 'phihag',
ec85ded8 492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
493 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
494 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 495 'upload_date': '20121002',
7caf9830 496 'license': 'Standard YouTube License',
4bc3a23e
PH
497 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
498 'categories': ['Science & Technology'],
000b6b5a 499 'tags': ['youtube-dl'],
556dbe7f 500 'duration': 10,
3e7c1224
PH
501 'like_count': int,
502 'dislike_count': int,
7c80519c 503 'start_time': 1,
297a564b 504 'end_time': 9,
2eb88d95 505 }
0e853ca4 506 },
0e853ca4 507 {
2d3d2997 508 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
4bc3a23e
PH
509 'note': 'Test generic use_cipher_signature video (#897)',
510 'info_dict': {
511 'id': 'UxxajLWwzqY',
512 'ext': 'mp4',
513 'upload_date': '20120506',
514 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
0cb58b02 515 'alt_title': 'I Love It (feat. Charli XCX)',
7caf9830 516 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
000b6b5a
S
517 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
518 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
519 'iconic ep', 'iconic', 'love', 'it'],
556dbe7f 520 'duration': 180,
4bc3a23e
PH
521 'uploader': 'Icona Pop',
522 'uploader_id': 'IconaPop',
ec85ded8 523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
7caf9830 524 'license': 'Standard YouTube License',
0cb58b02 525 'creator': 'Icona Pop',
936784b2
S
526 'track': 'I Love It (feat. Charli XCX)',
527 'artist': 'Icona Pop',
2eb88d95 528 }
c108eb73
JMF
529 },
530 {
4bc3a23e
PH
531 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
532 'note': 'Test VEVO video with age protection (#956)',
533 'info_dict': {
534 'id': '07FYdnEawAQ',
535 'ext': 'mp4',
536 'upload_date': '20130703',
537 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
0cb58b02 538 'alt_title': 'Tunnel Vision',
4bc3a23e 539 'description': 'md5:64249768eec3bc4276236606ea996373',
556dbe7f 540 'duration': 419,
4bc3a23e
PH
541 'uploader': 'justintimberlakeVEVO',
542 'uploader_id': 'justintimberlakeVEVO',
ec85ded8 543 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
7caf9830 544 'license': 'Standard YouTube License',
0cb58b02 545 'creator': 'Justin Timberlake',
7e72694b 546 'track': 'Tunnel Vision',
936784b2 547 'artist': 'Justin Timberlake',
34952f09 548 'age_limit': 18,
c108eb73
JMF
549 }
550 },
fccd3771 551 {
4bc3a23e
PH
552 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
553 'note': 'Embed-only video (#1746)',
554 'info_dict': {
555 'id': 'yZIXLfi8CZQ',
556 'ext': 'mp4',
557 'upload_date': '20120608',
558 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
559 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
560 'uploader': 'SET India',
94bfcd23 561 'uploader_id': 'setindia',
ec85ded8 562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
7caf9830 563 'license': 'Standard YouTube License',
94bfcd23 564 'age_limit': 18,
fccd3771
PH
565 }
566 },
11b56058 567 {
2d3d2997 568 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
569 'note': 'Use the first video ID in the URL',
570 'info_dict': {
571 'id': 'BaW_jenozKc',
572 'ext': 'mp4',
573 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
574 'uploader': 'Philipp Hagemeister',
575 'uploader_id': 'phihag',
ec85ded8 576 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 577 'upload_date': '20121002',
7caf9830 578 'license': 'Standard YouTube License',
11b56058
PM
579 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
580 'categories': ['Science & Technology'],
581 'tags': ['youtube-dl'],
556dbe7f 582 'duration': 10,
11b56058
PM
583 'like_count': int,
584 'dislike_count': int,
34a7de29
S
585 },
586 'params': {
587 'skip_download': True,
588 },
11b56058 589 },
dd27fd17 590 {
2d3d2997 591 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
592 'note': '256k DASH audio (format 141) via DASH manifest',
593 'info_dict': {
594 'id': 'a9LDPn-MO4I',
595 'ext': 'm4a',
596 'upload_date': '20121002',
597 'uploader_id': '8KVIDEO',
ec85ded8 598 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
599 'description': '',
600 'uploader': '8KVIDEO',
7caf9830 601 'license': 'Standard YouTube License',
4bc3a23e 602 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 603 },
4bc3a23e
PH
604 'params': {
605 'youtube_include_dash_manifest': True,
606 'format': '141',
4919603f 607 },
de3c7fe0 608 'skip': 'format 141 not served anymore',
dd27fd17 609 },
3489b7d2
JMF
610 # DASH manifest with encrypted signature
611 {
78caa52a
PH
612 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
613 'info_dict': {
614 'id': 'IB3lcPjvWLA',
615 'ext': 'm4a',
b766eb27 616 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
eb6793ba 617 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
556dbe7f 618 'duration': 244,
78caa52a
PH
619 'uploader': 'AfrojackVEVO',
620 'uploader_id': 'AfrojackVEVO',
621 'upload_date': '20131011',
7caf9830 622 'license': 'Standard YouTube License',
3489b7d2 623 },
4bc3a23e 624 'params': {
78caa52a 625 'youtube_include_dash_manifest': True,
de3c7fe0 626 'format': '141/bestaudio[ext=m4a]',
3489b7d2
JMF
627 },
628 },
aaeb86f6
S
629 # JS player signature function name containing $
630 {
631 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
632 'info_dict': {
633 'id': 'nfWlot6h_JM',
634 'ext': 'm4a',
635 'title': 'Taylor Swift - Shake It Off',
0cb58b02 636 'alt_title': 'Shake It Off',
f57b7835 637 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
556dbe7f 638 'duration': 242,
aaeb86f6
S
639 'uploader': 'TaylorSwiftVEVO',
640 'uploader_id': 'TaylorSwiftVEVO',
641 'upload_date': '20140818',
7caf9830 642 'license': 'Standard YouTube License',
0cb58b02 643 'creator': 'Taylor Swift',
aaeb86f6
S
644 },
645 'params': {
646 'youtube_include_dash_manifest': True,
de3c7fe0 647 'format': '141/bestaudio[ext=m4a]',
aaeb86f6
S
648 },
649 },
aa79ac0c
PH
650 # Controversy video
651 {
652 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
653 'info_dict': {
654 'id': 'T4XJQO3qol8',
655 'ext': 'mp4',
556dbe7f 656 'duration': 219,
aa79ac0c 657 'upload_date': '20100909',
eb6793ba 658 'uploader': 'TJ Kirk',
aa79ac0c 659 'uploader_id': 'TheAmazingAtheist',
ec85ded8 660 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
7caf9830 661 'license': 'Standard YouTube License',
aa79ac0c
PH
662 'title': 'Burning Everyone\'s Koran',
663 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
664 }
c522adb1
JMF
665 },
666 # Normal age-gate video (No vevo, embed allowed)
667 {
2d3d2997 668 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
669 'info_dict': {
670 'id': 'HtVdAasjOgU',
671 'ext': 'mp4',
672 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 673 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 674 'duration': 142,
c522adb1
JMF
675 'uploader': 'The Witcher',
676 'uploader_id': 'WitcherGame',
ec85ded8 677 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 678 'upload_date': '20140605',
7caf9830 679 'license': 'Standard YouTube License',
34952f09 680 'age_limit': 18,
c522adb1
JMF
681 },
682 },
fccae2b9
S
683 # Age-gate video with encrypted signature
684 {
2d3d2997 685 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
fccae2b9
S
686 'info_dict': {
687 'id': '6kLq3WMV1nU',
eb6793ba 688 'ext': 'webm',
fccae2b9
S
689 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
690 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
eb6793ba 691 'duration': 246,
fccae2b9
S
692 'uploader': 'LloydVEVO',
693 'uploader_id': 'LloydVEVO',
ec85ded8 694 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
fccae2b9 695 'upload_date': '20110629',
7caf9830 696 'license': 'Standard YouTube License',
34952f09 697 'age_limit': 18,
fccae2b9
S
698 },
699 },
774e208f 700 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
7d02dcfa 701 # YouTube Red ad is not captured for creator
774e208f
PH
702 {
703 'url': '__2ABJjxzNo',
704 'info_dict': {
705 'id': '__2ABJjxzNo',
706 'ext': 'mp4',
556dbe7f 707 'duration': 266,
774e208f
PH
708 'upload_date': '20100430',
709 'uploader_id': 'deadmau5',
ec85ded8 710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
0cb58b02 711 'creator': 'deadmau5',
774e208f
PH
712 'description': 'md5:12c56784b8032162bb936a5f76d55360',
713 'uploader': 'deadmau5',
7caf9830 714 'license': 'Standard YouTube License',
774e208f 715 'title': 'Deadmau5 - Some Chords (HD)',
0cb58b02 716 'alt_title': 'Some Chords',
774e208f
PH
717 },
718 'expected_warnings': [
719 'DASH manifest missing',
720 ]
e52a40ab
PH
721 },
722 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
723 {
724 'url': 'lqQg6PlCWgI',
725 'info_dict': {
726 'id': 'lqQg6PlCWgI',
727 'ext': 'mp4',
556dbe7f 728 'duration': 6085,
90227264 729 'upload_date': '20150827',
cbe2bd91 730 'uploader_id': 'olympic',
ec85ded8 731 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
7caf9830 732 'license': 'Standard YouTube License',
cbe2bd91 733 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 734 'uploader': 'Olympic',
cbe2bd91
PH
735 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
736 },
737 'params': {
738 'skip_download': 'requires avconv',
e52a40ab 739 }
cbe2bd91 740 },
6271f1ca
PH
741 # Non-square pixels
742 {
743 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
744 'info_dict': {
745 'id': '_b-2C3KPAM0',
746 'ext': 'mp4',
747 'stretched_ratio': 16 / 9.,
556dbe7f 748 'duration': 85,
6271f1ca
PH
749 'upload_date': '20110310',
750 'uploader_id': 'AllenMeow',
ec85ded8 751 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 752 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 753 'uploader': '孫ᄋᄅ',
7caf9830 754 'license': 'Standard YouTube License',
6271f1ca
PH
755 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
756 },
06b491eb
S
757 },
758 # url_encoded_fmt_stream_map is empty string
759 {
760 'url': 'qEJwOuvDf7I',
761 'info_dict': {
762 'id': 'qEJwOuvDf7I',
f57b7835 763 'ext': 'webm',
06b491eb
S
764 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
765 'description': '',
766 'upload_date': '20150404',
767 'uploader_id': 'spbelect',
768 'uploader': 'Наблюдатели Петербурга',
769 },
770 'params': {
771 'skip_download': 'requires avconv',
e323cf3f
S
772 },
773 'skip': 'This live event has ended.',
06b491eb 774 },
da77d856
S
775 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
776 {
777 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
778 'info_dict': {
779 'id': 'FIl7x6_3R5Y',
eb6793ba 780 'ext': 'webm',
da77d856
S
781 'title': 'md5:7b81415841e02ecd4313668cde88737a',
782 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 783 'duration': 220,
da77d856
S
784 'upload_date': '20150625',
785 'uploader_id': 'dorappi2000',
ec85ded8 786 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 787 'uploader': 'dorappi2000',
7caf9830 788 'license': 'Standard YouTube License',
eb6793ba 789 'formats': 'mincount:31',
da77d856 790 },
eb6793ba 791 'skip': 'not actual anymore',
2ee8f5d8 792 },
8a1a26ce
YCH
793 # DASH manifest with segment_list
794 {
795 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
796 'md5': '8ce563a1d667b599d21064e982ab9e31',
797 'info_dict': {
798 'id': 'CsmdDsKjzN8',
799 'ext': 'mp4',
17ee98e1 800 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
801 'uploader': 'Airtek',
802 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
803 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
7caf9830 804 'license': 'Standard YouTube License',
8a1a26ce
YCH
805 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
806 },
807 'params': {
808 'youtube_include_dash_manifest': True,
809 'format': '135', # bestvideo
be49068d
S
810 },
811 'skip': 'This live event has ended.',
2ee8f5d8 812 },
cf7e015f
S
813 {
814 # Multifeed videos (multiple cameras), URL is for Main Camera
815 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
816 'info_dict': {
817 'id': 'jqWvoWXjCVs',
818 'title': 'teamPGP: Rocket League Noob Stream',
819 'description': 'md5:dc7872fb300e143831327f1bae3af010',
820 },
821 'playlist': [{
822 'info_dict': {
823 'id': 'jqWvoWXjCVs',
824 'ext': 'mp4',
825 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
826 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 827 'duration': 7335,
cf7e015f
S
828 'upload_date': '20150721',
829 'uploader': 'Beer Games Beer',
830 'uploader_id': 'beergamesbeer',
ec85ded8 831 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 832 'license': 'Standard YouTube License',
cf7e015f
S
833 },
834 }, {
835 'info_dict': {
836 'id': '6h8e8xoXJzg',
837 'ext': 'mp4',
838 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
839 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 840 'duration': 7337,
cf7e015f
S
841 'upload_date': '20150721',
842 'uploader': 'Beer Games Beer',
843 'uploader_id': 'beergamesbeer',
ec85ded8 844 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 845 'license': 'Standard YouTube License',
cf7e015f
S
846 },
847 }, {
848 'info_dict': {
849 'id': 'PUOgX5z9xZw',
850 'ext': 'mp4',
851 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
852 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 853 'duration': 7337,
cf7e015f
S
854 'upload_date': '20150721',
855 'uploader': 'Beer Games Beer',
856 'uploader_id': 'beergamesbeer',
ec85ded8 857 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 858 'license': 'Standard YouTube License',
cf7e015f
S
859 },
860 }, {
861 'info_dict': {
862 'id': 'teuwxikvS5k',
863 'ext': 'mp4',
864 'title': 'teamPGP: Rocket League Noob Stream (zim)',
865 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 866 'duration': 7334,
cf7e015f
S
867 'upload_date': '20150721',
868 'uploader': 'Beer Games Beer',
869 'uploader_id': 'beergamesbeer',
ec85ded8 870 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 871 'license': 'Standard YouTube License',
cf7e015f
S
872 },
873 }],
874 'params': {
875 'skip_download': True,
876 },
cbaed4bb 877 },
f9f49d87
S
878 {
879 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
880 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
881 'info_dict': {
882 'id': 'gVfLd0zydlo',
883 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
884 },
885 'playlist_count': 2,
be49068d 886 'skip': 'Not multifeed anymore',
f9f49d87 887 },
cbaed4bb 888 {
2d3d2997 889 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 890 'only_matching': True,
0e49d9a6 891 },
6d4fc66b 892 {
2d3d2997 893 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
894 'only_matching': True,
895 },
0e49d9a6 896 {
61f92af1 897 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
a8776b10
S
898 # Also tests cut-off URL expansion in video description (see
899 # https://github.com/rg3/youtube-dl/issues/1892,
900 # https://github.com/rg3/youtube-dl/issues/8164)
0e49d9a6
LL
901 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
902 'info_dict': {
903 'id': 'lsguqyKfVQg',
904 'ext': 'mp4',
905 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 906 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 907 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 908 'duration': 133,
0e49d9a6
LL
909 'upload_date': '20151119',
910 'uploader_id': 'IronSoulElf',
ec85ded8 911 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 912 'uploader': 'IronSoulElf',
7caf9830 913 'license': 'Standard YouTube License',
eb6793ba
S
914 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
915 'track': 'Dark Walk - Position Music',
916 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
0e49d9a6
LL
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
61f92af1
S
922 {
923 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
924 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
925 'only_matching': True,
926 },
313dfc45
LL
927 {
928 # Video with yt:stretch=17:0
929 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
930 'info_dict': {
931 'id': 'Q39EVAstoRM',
932 'ext': 'mp4',
933 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
934 'description': 'md5:ee18a25c350637c8faff806845bddee9',
935 'upload_date': '20151107',
936 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
937 'uploader': 'CH GAMER DROID',
938 },
939 'params': {
940 'skip_download': True,
941 },
be49068d 942 'skip': 'This video does not exist.',
313dfc45 943 },
7caf9830
S
944 {
945 # Video licensed under Creative Commons
946 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
947 'info_dict': {
948 'id': 'M4gD1WSo5mA',
949 'ext': 'mp4',
950 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
951 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 952 'duration': 721,
7caf9830
S
953 'upload_date': '20150127',
954 'uploader_id': 'BerkmanCenter',
ec85ded8 955 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 956 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
957 'license': 'Creative Commons Attribution license (reuse allowed)',
958 },
959 'params': {
960 'skip_download': True,
961 },
962 },
fd050249
S
963 {
964 # Channel-like uploader_url
965 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
966 'info_dict': {
967 'id': 'eQcmzGIKrzg',
968 'ext': 'mp4',
969 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
970 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 971 'duration': 4060,
fd050249 972 'upload_date': '20151119',
eb6793ba 973 'uploader': 'Bernie Sanders',
fd050249 974 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 975 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
976 'license': 'Creative Commons Attribution license (reuse allowed)',
977 },
978 'params': {
979 'skip_download': True,
980 },
981 },
040ac686
S
982 {
983 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
984 'only_matching': True,
7f29cf54
S
985 },
986 {
987 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
988 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
989 'only_matching': True,
6496ccb4
S
990 },
991 {
992 # Rental video preview
993 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
994 'info_dict': {
995 'id': 'uGpuVWrhIzE',
996 'ext': 'mp4',
997 'title': 'Piku - Trailer',
998 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
999 'upload_date': '20150811',
1000 'uploader': 'FlixMatrix',
1001 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1002 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1003 'license': 'Standard YouTube License',
1004 },
1005 'params': {
1006 'skip_download': True,
1007 },
eb6793ba 1008 'skip': 'This video is not available.',
022a5d66 1009 },
12afdc2a
S
1010 {
1011 # YouTube Red video with episode data
1012 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1013 'info_dict': {
1014 'id': 'iqKdEhx-dD4',
1015 'ext': 'mp4',
1016 'title': 'Isolation - Mind Field (Ep 1)',
eb6793ba 1017 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
556dbe7f 1018 'duration': 2085,
12afdc2a
S
1019 'upload_date': '20170118',
1020 'uploader': 'Vsauce',
1021 'uploader_id': 'Vsauce',
1022 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1023 'license': 'Standard YouTube License',
1024 'series': 'Mind Field',
1025 'season_number': 1,
1026 'episode_number': 1,
1027 },
1028 'params': {
1029 'skip_download': True,
1030 },
1031 'expected_warnings': [
1032 'Skipping DASH manifest',
1033 ],
1034 },
c7121fa7
S
1035 {
1036 # The following content has been identified by the YouTube community
1037 # as inappropriate or offensive to some audiences.
1038 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1039 'info_dict': {
1040 'id': '6SJNVb0GnPI',
1041 'ext': 'mp4',
1042 'title': 'Race Differences in Intelligence',
1043 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1044 'duration': 965,
1045 'upload_date': '20140124',
1046 'uploader': 'New Century Foundation',
1047 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1048 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1049 'license': 'Standard YouTube License',
c7121fa7
S
1050 },
1051 'params': {
1052 'skip_download': True,
1053 },
1054 },
022a5d66
S
1055 {
1056 # itag 212
1057 'url': '1t24XAntNCY',
1058 'only_matching': True,
fd5c4aab
S
1059 },
1060 {
1061 # geo restricted to JP
1062 'url': 'sJL6WA-aGkQ',
1063 'only_matching': True,
1064 },
d0ba5587
S
1065 {
1066 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1067 'only_matching': True,
1068 },
2eb88d95
PH
1069 ]
1070
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Set up the extractor and start with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature, keyed by
    # (player_url, signature cache id).
    self._player_cache = dict()
e0df6211 1074
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1078
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1082
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1086
def report_rtmp_download(self):
    """Tell the user that the download is going to use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1090
60064c53
PH
1091 def _signature_cache_id(self, example_sig):
1092 """ Return a string representation of a signature """
78caa52a 1093 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
1094
1095 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 1096 id_m = re.match(
e31fed95 1097 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
cf010131 1098 player_url)
c081b35c
PH
1099 if not id_m:
1100 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
1101 player_type = id_m.group('ext')
1102 player_id = id_m.group('id')
1103
c4417ddb 1104 # Read from filesystem cache
60064c53
PH
1105 func_id = '%s_%s_%s' % (
1106 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 1107 assert os.path.basename(func_id) == func_id
a0e07d31 1108
69ea8ca4 1109 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1110 if cache_spec is not None:
78caa52a 1111 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1112
6d1a55a5
PH
1113 download_note = (
1114 'Downloading player %s' % player_url
1115 if self._downloader.params.get('verbose') else
1116 'Downloading %s player %s' % (player_type, player_id)
1117 )
e0df6211
PH
1118 if player_type == 'js':
1119 code = self._download_webpage(
1120 player_url, video_id,
6d1a55a5 1121 note=download_note,
69ea8ca4 1122 errnote='Download of %s failed' % player_url)
83799698 1123 res = self._parse_sig_js(code)
c4417ddb 1124 elif player_type == 'swf':
e0df6211
PH
1125 urlh = self._request_webpage(
1126 player_url, video_id,
6d1a55a5 1127 note=download_note,
69ea8ca4 1128 errnote='Download of %s failed' % player_url)
e0df6211 1129 code = urlh.read()
83799698 1130 res = self._parse_sig_swf(code)
e0df6211
PH
1131 else:
1132 assert False, 'Invalid player type %r' % player_type
1133
785521bf
PH
1134 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1135 cache_res = res(test_string)
1136 cache_spec = [ord(c) for c in cache_res]
83799698 1137
69ea8ca4 1138 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1139 return res
1140
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is run on a probe string whose characters have the code points
    0..len(example_sig)-1, so the code points of the result are the input
    indices used for each output position.  Those indices are rendered as
    a concatenation of s[...] index/slice expressions and shown through
    to_screen.
    """
    def gen_sig_code(idxs):
        # Yields 's[i]' for isolated indices and 's[a:b:step]' for runs of
        # consecutive indices that rise or fall by one.
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A negative exclusive end bound has to be written as an open
            # slice end.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run.
                    continue
                # The run ended at prev: emit it and start over.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run starts at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is pending after the last pair.
        # NOTE(review): i is only bound if the loop ran at least once, i.e.
        # idxs has two or more entries.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    # Same shape information as _signature_cache_id, rendered as a tuple.
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1179
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature function is located with a list of regular
    expressions and the function itself is extracted with JSInterpreter.
    The returned callable takes the signature string as its only argument.
    """
    name_patterns = (
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    js_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        # The extracted function expects its arguments as a list.
        return js_function([s])

    return decipher
1191
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    Uses SWFInterpreter to extract the 'decipher' function of the
    'SignatureDecipher' class.  The returned callable takes the signature
    string as its only argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function expects its arguments as a list.
        return swf_function([s])

    return decipher
1198
83799698 1199 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1200 """Turn the encrypted s field into a working signature"""
6b37f0be 1201
c8bf86d5 1202 if player_url is None:
69ea8ca4 1203 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1204
69ea8ca4 1205 if player_url.startswith('//'):
78caa52a 1206 player_url = 'https:' + player_url
3c90cc8b
S
1207 elif not re.match(r'https?://', player_url):
1208 player_url = compat_urlparse.urljoin(
1209 'https://www.youtube.com', player_url)
c8bf86d5 1210 try:
62af3a0e 1211 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1212 if player_id not in self._player_cache:
1213 func = self._extract_signature_function(
60064c53 1214 video_id, player_url, s
c8bf86d5
PH
1215 )
1216 self._player_cache[player_id] = func
1217 func = self._player_cache[player_id]
1218 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1219 self._print_sig_code(func, s)
c8bf86d5
PH
1220 return func(s)
1221 except Exception as e:
1222 tb = traceback.format_exc()
1223 raise ExtractorError(
78caa52a 1224 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1225
def _get_subtitles(self, video_id, webpage):
    """Return the subtitle tracks listed by the timedtext service.

    The result maps each language code to a list of {'url', 'ext'} dicts,
    one per entry of self._SUBTITLE_FORMATS.  Only the first track of a
    language is used.  An empty dict is returned, after a warning, when
    the track list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang not in subtitles:
            subtitles[lang] = [{
                'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                }),
                'ext': ext,
            } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1257
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ytplayer.config object of a watch page, or None.

    None is returned when no config is found in webpage; parsing is
    non-fatal.
    """
    # User data may contain arbitrary character sequences that can break
    # regex based JSON extraction: when '};' occurs inside the data, the
    # second pattern does not capture the whole JSON.  As a workaround the
    # more specific pattern is tried first, until proper quoted string
    # handling is implemented (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1275
def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic (translated) caption tracks of a video.

    The watch page is passed in because the caption URLs are read from
    its player config; this avoids downloading it again.

    The result maps each target language code to a list of
    {'url', 'ext'} dicts, one per entry of self._SUBTITLE_FORMATS.  An
    empty dict is returned, after a warning, when no captions can be
    found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        # First source: the 'ttsurl' player argument.
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Builds, for every language and format, a copy of sub_url
            # whose query has 'tlang' and 'fmt' overridden.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                # Only the first caption track is used as the base URL.
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # No longer used as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1377
d77ab8e2
S
1378 def _mark_watched(self, video_id, video_info):
1379 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1380 if not playback_url:
1381 return
1382 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1383 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1384
1385 # cpn generation algorithm is reverse engineered from base.js.
1386 # In fact it works even with dummy cpn.
1387 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1388 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1389
1390 qs.update({
1391 'ver': ['2'],
1392 'cpn': [cpn],
1393 })
1394 playback_url = compat_urlparse.urlunparse(
15707c7e 1395 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
d77ab8e2
S
1396
1397 self._download_webpage(
1398 playback_url, video_id, 'Marking watched',
1399 'Unable to mark watched', fatal=False)
1400
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return all YouTube references embedded in an HTML page.

    Three kinds of embeds are recognised, in this order:
    player URLs in iframe/embed/object tags, data-video-url attributes
    and SWFObject calls; lazyYT 'data-youtube-id' values; and the
    'data-video_id' values of the Wordpress "YouTube Video Importer"
    plugin.  The result is a list of strings and may be empty.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative matches a call such as
    # swfobject.embedSWF("//www.youtube.com/v/..."), i.e. an opening
    # parenthesis followed by optional whitespace and the quoted URL.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a tuple of all groups; the video id is the last one.
    entries.extend(m[-1] for m in matches)

    return entries
1432
@staticmethod
def _extract_url(webpage):
    """Return the first YouTube reference embedded in webpage, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1437
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id part of url.

    The URL is matched against cls._VALID_URL in verbose mode and the
    second group is returned.  Raises ExtractorError when the URL does
    not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1445
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document of a video."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 1449
9cafc3fd
S
@staticmethod
def _extract_chapters(description, duration):
    """Derive chapters from seekTo links in an HTML video description.

    Every description line that contains a player seekTo link with a
    time stamp starts a chapter; a chapter ends where the next one starts
    and the last one ends at duration.

    Returns None when description is empty or holds no such lines,
    otherwise a list of dicts with 'start_time', 'end_time' and 'title'.
    Chapters whose start or end time cannot be determined are skipped.
    When duration is None no clipping against it takes place and the last
    chapter, whose end is then unknown, is skipped.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    # next_num is the index of the following chapter line (enumeration
    # starts at 1), which supplies the end time of the current chapter.
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so
        # the duration checks only apply when the duration is known.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # The title is the line without its link, trimmed and with
        # whitespace runs collapsed.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1484
c5e8d7af 1485 def _real_extract(self, url):
cf7e015f
S
1486 url, smuggled_data = unsmuggle_url(url, {})
1487
7e8c0af0 1488 proto = (
78caa52a
PH
1489 'http' if self._downloader.params.get('prefer_insecure', False)
1490 else 'https')
7e8c0af0 1491
7c80519c 1492 start_time = None
297a564b 1493 end_time = None
7c80519c
JMF
1494 parsed_url = compat_urllib_parse_urlparse(url)
1495 for component in [parsed_url.fragment, parsed_url.query]:
1496 query = compat_parse_qs(component)
297a564b 1497 if start_time is None and 't' in query:
7c80519c 1498 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1499 if start_time is None and 'start' in query:
1500 start_time = parse_duration(query['start'][0])
297a564b
JMF
1501 if end_time is None and 'end' in query:
1502 end_time = parse_duration(query['end'][0])
7c80519c 1503
c5e8d7af
PH
1504 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1505 mobj = re.search(self._NEXT_URL_RE, url)
1506 if mobj:
7fd002c0 1507 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1508 video_id = self.extract_id(url)
c5e8d7af
PH
1509
1510 # Get video webpage
aa79ac0c 1511 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 1512 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1513
1514 # Attempt to extract SWF player URL
e0df6211 1515 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1516 if mobj is not None:
1517 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1518 else:
1519 player_url = None
1520
d8d24a92
S
1521 dash_mpds = []
1522
1523 def add_dash_mpd(video_info):
1524 dash_mpd = video_info.get('dashmpd')
1525 if dash_mpd and dash_mpd[0] not in dash_mpds:
1526 dash_mpds.append(dash_mpd[0])
1527
c7121fa7
S
1528 is_live = None
1529 view_count = None
1530
1531 def extract_view_count(v_info):
1532 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1533
c5e8d7af 1534 # Get video info
6449cd80 1535 embed_webpage = None
c108eb73 1536 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
1537 age_gate = True
1538 # We simulate the access to the video from www.youtube.com/v/{video_id}
1539 # this can be viewed without login into Youtube
beb95e77
CL
1540 url = proto + '://www.youtube.com/embed/%s' % video_id
1541 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
15707c7e 1542 data = compat_urllib_parse_urlencode({
2c57c7fa
JMF
1543 'video_id': video_id,
1544 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 1545 'sts': self._search_regex(
beb95e77 1546 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
2c57c7fa 1547 })
7e8c0af0 1548 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
1549 video_info_webpage = self._download_webpage(
1550 video_info_url, video_id,
20436c30 1551 note='Refetching age-gated info webpage',
94bd3613 1552 errnote='unable to download video info webpage')
c5e8d7af 1553 video_info = compat_parse_qs(video_info_webpage)
d8d24a92 1554 add_dash_mpd(video_info)
c108eb73
JMF
1555 else:
1556 age_gate = False
bc93bdb5 1557 video_info = None
dc4e4f90 1558 sts = None
d8d24a92 1559 # Try looking directly into the video webpage
a72778d3
S
1560 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1561 if ytplayer_config:
4e62ebe2 1562 args = ytplayer_config['args']
4c76aa06 1563 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1564 # Convert to the same format returned by compat_parse_qs
1565 video_info = dict((k, [v]) for k, v in args.items())
1566 add_dash_mpd(video_info)
6496ccb4
S
1567 # Rental video is not rented but preview is available (e.g.
1568 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1569 # https://github.com/rg3/youtube-dl/issues/10532)
1570 if not video_info and args.get('ypc_vid'):
1571 return self.url_result(
1572 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1573 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1574 is_live = True
dc4e4f90 1575 sts = ytplayer_config.get('sts')
0a3cf9ad
S
1576 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1577 # We also try looking in get_video_info since it may contain different dashmpd
1578 # URL that points to a DASH manifest with possibly different itag set (some itags
1579 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1580 # manifest pointed by get_video_info's dashmpd).
1581 # The general idea is to take a union of itags of both DASH manifests (for example
1582 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
4e62ebe2 1583 self.report_video_info_webpage_download(video_id)
dc4e4f90
S
1584 for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
1585 query = {
1586 'video_id': video_id,
1587 'ps': 'default',
1588 'eurl': '',
1589 'gl': 'US',
1590 'hl': 'en',
1591 }
1592 if el:
1593 query['el'] = el
1594 if sts:
1595 query['sts'] = sts
810fb84d 1596 video_info_webpage = self._download_webpage(
dc4e4f90 1597 '%s://www.youtube.com/get_video_info' % proto,
4e62ebe2 1598 video_id, note=False,
dc4e4f90
S
1599 errnote='unable to download video info webpage',
1600 fatal=False, query=query)
1601 if not video_info_webpage:
1602 continue
0a3cf9ad 1603 get_video_info = compat_parse_qs(video_info_webpage)
fd545fc6 1604 add_dash_mpd(get_video_info)
c7121fa7
S
1605 if view_count is None:
1606 view_count = extract_view_count(get_video_info)
0a3cf9ad
S
1607 if not video_info:
1608 video_info = get_video_info
1609 if 'token' in get_video_info:
89ea063e
S
1610 # Different get_video_info requests may report different results, e.g.
1611 # some may report video unavailability, but some may serve it without
1612 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1613 # the original webpage as well as el=info and el=embedded get_video_info
1614 # requests report video unavailability due to geo restriction while
1615 # el=detailpage succeeds and returns valid data). This is probably
1616 # due to YouTube measures against IP ranges of hosting providers.
1617 # Working around by preferring the first succeeded video_info containing
1618 # the token if no such video_info yet was found.
44b2264f
S
1619 if 'token' not in video_info:
1620 video_info = get_video_info
4e62ebe2 1621 break
bbb7c3f7
YCH
1622
1623 def extract_unavailable_message():
1624 return self._html_search_regex(
1625 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1626 video_webpage, 'unavailable message', default=None)
1627
c5e8d7af
PH
1628 if 'token' not in video_info:
1629 if 'reason' in video_info:
af214c3a 1630 if 'The uploader has not made this video available in your country.' in video_info['reason']:
fd5c4aab
S
1631 regions_allowed = self._html_search_meta(
1632 'regionsAllowed', video_webpage, default=None)
1633 countries = regions_allowed.split(',') if regions_allowed else None
1634 self.raise_geo_restricted(
1635 msg=video_info['reason'][0], countries=countries)
bbb7c3f7
YCH
1636 reason = video_info['reason'][0]
1637 if 'Invalid parameters' in reason:
1638 unavailable_message = extract_unavailable_message()
1639 if unavailable_message:
1640 reason = unavailable_message
d11271dd 1641 raise ExtractorError(
bbb7c3f7 1642 'YouTube said: %s' % reason,
d11271dd 1643 expected=True, video_id=video_id)
c5e8d7af 1644 else:
d11271dd 1645 raise ExtractorError(
78caa52a 1646 '"token" parameter not in video info for unknown reason',
d11271dd 1647 video_id=video_id)
c5e8d7af 1648
cf7e015f
S
1649 # title
1650 if 'title' in video_info:
1651 video_title = video_info['title'][0]
1652 else:
1653 self._downloader.report_warning('Unable to extract video title')
1654 video_title = '_'
1655
1656 # description
9cafc3fd 1657 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1658 if video_description:
fa4bc6e7
RA
1659
1660 def replace_url(m):
1661 redir_url = compat_urlparse.urljoin(url, m.group(1))
1662 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1663 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1664 qs = compat_parse_qs(parsed_redir_url.query)
1665 q = qs.get('q')
1666 if q and q[0]:
1667 return q[0]
1668 return redir_url
1669
9cafc3fd 1670 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1671 <a\s+
25cb7a0e 1672 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1673 (?:title|href)="([^"]+)"\s+
25cb7a0e 1674 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1675 class="[^"]*"[^>]*>
23f13e97 1676 [^<]+\.{3}\s*
cf7e015f 1677 </a>
fa4bc6e7 1678 ''', replace_url, video_description)
cf7e015f
S
1679 video_description = clean_html(video_description)
1680 else:
1681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1682 if fd_mobj:
1683 video_description = unescapeHTML(fd_mobj.group(1))
1684 else:
1685 video_description = ''
1686
5e1eddb9
S
1687 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1688 if not self._downloader.params.get('noplaylist'):
1689 entries = []
1690 feed_ids = []
6863631c 1691 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
5e1eddb9 1692 for feed in multifeed_metadata_list.split(','):
6863631c
S
1693 # Unquote should take place before split on comma (,) since textual
1694 # fields may contain comma as well (see
1695 # https://github.com/rg3/youtube-dl/issues/8536)
1696 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
5e1eddb9
S
1697 entries.append({
1698 '_type': 'url_transparent',
1699 'ie_key': 'Youtube',
1700 'url': smuggle_url(
1701 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1702 {'force_singlefeed': True}),
1703 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1704 })
1705 feed_ids.append(feed_data['id'][0])
1706 self.to_screen(
1707 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1708 % (', '.join(feed_ids), video_id))
1709 return self.playlist_result(entries, video_id, video_title, video_description)
1710 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1711
c7121fa7 1712 if view_count is None:
1c9c8de2 1713 view_count = extract_view_count(video_info)
1d699755 1714
c5e8d7af
PH
1715 # Check for "rental" videos
1716 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
c9612c04 1717 raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1718
c63ca0ee
S
1719 def _extract_filesize(media_url):
1720 return int_or_none(self._search_regex(
1721 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1722
c5e8d7af
PH
1723 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1724 self.report_rtmp_download()
dd27fd17
PH
1725 formats = [{
1726 'format_id': '_rtmp',
1727 'protocol': 'rtmp',
1728 'url': video_info['conn'][0],
1729 'player_url': player_url,
1730 }]
391dd6f0 1731 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1732 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1733 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1734 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
3318832e 1735 formats_spec = {}
82156fdb 1736 fmt_list = video_info.get('fmt_list', [''])[0]
1737 if fmt_list:
1738 for fmt in fmt_list.split(','):
1739 spec = fmt.split('/')
3318832e 1740 if len(spec) > 1:
1741 width_height = spec[1].split('x')
1742 if len(width_height) == 2:
1743 formats_spec[spec[0]] = {
1744 'resolution': spec[1],
1745 'width': int_or_none(width_height[0]),
1746 'height': int_or_none(width_height[1]),
1747 }
54fc90aa 1748 q = qualities(['small', 'medium', 'hd720'])
c9afb51c 1749 formats = []
00fe14fc 1750 for url_data_str in encoded_url_map.split(','):
c5e8d7af 1751 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
1752 if 'itag' not in url_data or 'url' not in url_data:
1753 continue
1754 format_id = url_data['itag'][0]
1755 url = url_data['url'][0]
1756
a49eccdf 1757 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
6449cd80 1758 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
beb95e77 1759 jsplayer_url_json = self._search_regex(
6449cd80
PH
1760 ASSETS_RE,
1761 embed_webpage if age_gate else video_webpage,
1762 'JS player URL (1)', default=None)
1763 if not jsplayer_url_json and not age_gate:
1764 # We need the embed website after all
1765 if embed_webpage is None:
1766 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1767 embed_webpage = self._download_webpage(
1768 embed_url, video_id, 'Downloading embed webpage')
1769 jsplayer_url_json = self._search_regex(
1770 ASSETS_RE, embed_webpage, 'JS player URL')
1771
beb95e77 1772 player_url = json.loads(jsplayer_url_json)
201e9eaa
PH
1773 if player_url is None:
1774 player_url_json = self._search_regex(
1775 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 1776 video_webpage, 'age gate player URL')
201e9eaa
PH
1777 player_url = json.loads(player_url_json)
1778
a49eccdf
YCH
1779 if 'sig' in url_data:
1780 url += '&signature=' + url_data['sig'][0]
1781 elif 's' in url_data:
1782 encrypted_sig = url_data['s'][0]
1783
201e9eaa 1784 if self._downloader.params.get('verbose'):
cf010131 1785 if player_url is None:
201e9eaa
PH
1786 player_version = 'unknown'
1787 player_desc = 'unknown'
1788 else:
1789 if player_url.endswith('swf'):
1790 player_version = self._search_regex(
1791 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 1792 'flash player', fatal=False)
201e9eaa 1793 player_desc = 'flash player %s' % player_version
cf010131 1794 else:
201e9eaa 1795 player_version = self._search_regex(
b62985a9
YCH
1796 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1797 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
201e9eaa
PH
1798 player_url,
1799 'html5 player', fatal=False)
78caa52a 1800 player_desc = 'html5 player %s' % player_version
201e9eaa 1801
60064c53 1802 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 1803 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 1804 (format_id, parts_sizes, player_desc))
201e9eaa
PH
1805
1806 signature = self._decrypt_signature(
1807 encrypted_sig, video_id, player_url, age_gate)
1808 url += '&signature=' + signature
1809 if 'ratebypass' not in url:
1810 url += '&ratebypass=yes'
c9afb51c 1811
94278f72
YCH
1812 dct = {
1813 'format_id': format_id,
1814 'url': url,
1815 'player_url': player_url,
1816 }
1817 if format_id in self._formats:
1818 dct.update(self._formats[format_id])
3318832e 1819 if format_id in formats_spec:
1820 dct.update(formats_spec[format_id])
94278f72 1821
aabc2be6
S
1822 # Some itags are not included in DASH manifest thus corresponding formats will
1823 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1824 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1825 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1826 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 1827
c63ca0ee
S
1828 filesize = int_or_none(url_data.get(
1829 'clen', [None])[0]) or _extract_filesize(url)
1830
54fc90aa
RA
1831 quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
1832
94278f72 1833 more_fields = {
c63ca0ee 1834 'filesize': filesize,
aabc2be6 1835 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
c9afb51c
AH
1836 'width': width,
1837 'height': height,
1838 'fps': int_or_none(url_data.get('fps', [None])[0]),
54fc90aa
RA
1839 'format_note': quality,
1840 'quality': q(quality),
c9afb51c 1841 }
94278f72
YCH
1842 for key, value in more_fields.items():
1843 if value:
1844 dct[key] = value
aabc2be6
S
1845 type_ = url_data.get('type', [None])[0]
1846 if type_:
1847 type_split = type_.split(';')
1848 kind_ext = type_split[0].split('/')
1849 if len(kind_ext) == 2:
94278f72
YCH
1850 kind, _ = kind_ext
1851 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
1852 if kind in ('audio', 'video'):
1853 codecs = None
1854 for mobj in re.finditer(
1855 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1856 if mobj.group('key') == 'codecs':
1857 codecs = mobj.group('val')
1858 break
1859 if codecs:
6310acf5 1860 dct.update(parse_codecs(codecs))
e4a60912
S
1861 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1862 dct['downloader_options'] = {
1863 # Youtube throttles chunks >~10M
1864 'http_chunk_size': 10485760,
1865 }
aabc2be6 1866 formats.append(dct)
1d043b93
JMF
1867 elif video_info.get('hlsvp'):
1868 manifest_url = video_info['hlsvp'][0]
89beedd3
RA
1869 formats = []
1870 m3u8_formats = self._extract_m3u8_formats(
1871 manifest_url, video_id, 'mp4', fatal=False)
1872 for a_format in m3u8_formats:
1873 itag = self._search_regex(
1874 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
1875 if itag:
1876 a_format['format_id'] = itag
1877 if itag in self._formats:
1878 dct = self._formats[itag].copy()
1879 dct.update(a_format)
1880 a_format = dct
1881 a_format['player_url'] = player_url
1882 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
049d71d8 1883 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
89beedd3 1884 formats.append(a_format)
c5e8d7af 1885 else:
4c76aa06
RA
1886 error_message = clean_html(video_info.get('reason', [None])[0])
1887 if not error_message:
1888 error_message = extract_unavailable_message()
1889 if error_message:
1890 raise ExtractorError(error_message, expected=True)
69ea8ca4 1891 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1892
7e72694b
S
1893 # uploader
1894 video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
1895 if video_uploader:
1896 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
1897 else:
1898 self._downloader.report_warning('unable to extract uploader name')
1899
1900 # uploader_id
1901 video_uploader_id = None
1902 video_uploader_url = None
1903 mobj = re.search(
1904 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1905 video_webpage)
1906 if mobj is not None:
1907 video_uploader_id = mobj.group('uploader_id')
1908 video_uploader_url = mobj.group('uploader_url')
1909 else:
1910 self._downloader.report_warning('unable to extract uploader nickname')
1911
dd4c4492
S
1912 channel_id = self._html_search_meta(
1913 'channelId', video_webpage, 'channel id')
1914 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
1915
7e72694b
S
1916 # thumbnail image
1917 # We try first to get a high quality image:
1918 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1919 video_webpage, re.DOTALL)
1920 if m_thumb is not None:
1921 video_thumbnail = m_thumb.group(1)
1922 elif 'thumbnail_url' not in video_info:
1923 self._downloader.report_warning('unable to extract video thumbnail')
1924 video_thumbnail = None
1925 else: # don't panic if we can't find it
1926 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1927
1928 # upload date
1929 upload_date = self._html_search_meta(
1930 'datePublished', video_webpage, 'upload date', default=None)
1931 if not upload_date:
1932 upload_date = self._search_regex(
1933 [r'(?s)id="eow-date.*?>(.*?)</span>',
1934 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
1935 video_webpage, 'upload date', default=None)
1936 upload_date = unified_strdate(upload_date)
1937
1938 video_license = self._html_search_regex(
1939 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1940 video_webpage, 'license', default=None)
1941
1942 m_music = re.search(
1943 r'''(?x)
1944 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
1945 <ul[^>]*>\s*
1946 <li>(?P<title>.+?)
1947 by (?P<creator>.+?)
1948 (?:
1949 \(.+?\)|
1950 <a[^>]*
1951 (?:
1952 \bhref=["\']/red[^>]*>| # drop possible
1953 >\s*Listen ad-free with YouTube Red # YouTube Red ad
1954 )
1955 .*?
1956 )?</li
1957 ''',
1958 video_webpage)
1959 if m_music:
1960 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1961 video_creator = clean_html(m_music.group('creator'))
1962 else:
1963 video_alt_title = video_creator = None
1964
1965 def extract_meta(field):
1966 return self._html_search_regex(
1967 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
1968 video_webpage, field, default=None)
1969
1970 track = extract_meta('Song')
1971 artist = extract_meta('Artist')
1972
1973 m_episode = re.search(
1974 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
1975 video_webpage)
1976 if m_episode:
1977 series = m_episode.group('series')
1978 season_number = int(m_episode.group('season'))
1979 episode_number = int(m_episode.group('episode'))
1980 else:
1981 series = season_number = episode_number = None
1982
1983 m_cat_container = self._search_regex(
1984 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1985 video_webpage, 'categories', default=None)
1986 if m_cat_container:
1987 category = self._html_search_regex(
1988 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1989 default=None)
1990 video_categories = None if category is None else [category]
1991 else:
1992 video_categories = None
1993
1994 video_tags = [
1995 unescapeHTML(m.group('content'))
1996 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1997
1998 def _extract_count(count_name):
1999 return str_to_int(self._search_regex(
2000 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2001 % re.escape(count_name),
2002 video_webpage, count_name, default=None))
2003
2004 like_count = _extract_count('like')
2005 dislike_count = _extract_count('dislike')
2006
2007 # subtitles
2008 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2009 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2010
2011 video_duration = try_get(
2012 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2013 if not video_duration:
2014 video_duration = parse_duration(self._html_search_meta(
2015 'duration', video_webpage, 'video duration'))
2016
2017 # annotations
2018 video_annotations = None
2019 if self._downloader.params.get('writeannotations', False):
2020 video_annotations = self._extract_annotations(video_id)
2021
2022 chapters = self._extract_chapters(description_original, video_duration)
2023
dd27fd17 2024 # Look for the DASH manifest
203fb43f 2025 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2026 dash_mpd_fatal = True
8ff648e4 2027 for mpd_url in dash_mpds:
d8d24a92 2028 dash_formats = {}
774e208f 2029 try:
05d0d131
YCH
2030 def decrypt_sig(mobj):
2031 s = mobj.group(1)
2032 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2033 return '/signature/%s' % dec_s
2034
8ff648e4 2035 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2036
8ff648e4 2037 for df in self._extract_mpd_formats(
2038 mpd_url, video_id, fatal=dash_mpd_fatal,
2039 formats_dict=self._formats):
c63ca0ee
S
2040 if not df.get('filesize'):
2041 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2042 # Do not overwrite DASH format found in some previous DASH manifest
2043 if df['format_id'] not in dash_formats:
2044 dash_formats[df['format_id']] = df
77c6fb5b
S
2045 # Additional DASH manifests may end up in HTTP Error 403 therefore
2046 # allow them to fail without bug report message if we already have
2047 # some DASH manifest succeeded. This is temporary workaround to reduce
2048 # burst of bug reports until we figure out the reason and whether it
2049 # can be fixed at all.
2050 dash_mpd_fatal = False
774e208f
PH
2051 except (ExtractorError, KeyError) as e:
2052 self.report_warning(
2053 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2054 if dash_formats:
04b3b3df
JMF
2055 # Remove the formats we found through non-DASH, they
2056 # contain less info and it can be wrong, because we use
2057 # fixed values (for example the resolution). See
2058 # https://github.com/rg3/youtube-dl/issues/5774 for an
2059 # example.
d80265cc 2060 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2061 formats.extend(dash_formats.values())
d80044c2 2062
6271f1ca
PH
2063 # Check for malformed aspect ratio
2064 stretched_m = re.search(
2065 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2066 video_webpage)
2067 if stretched_m:
313dfc45
LL
2068 w = float(stretched_m.group('w'))
2069 h = float(stretched_m.group('h'))
5faf9fed
S
2070 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2071 # We will only process correct ratios.
313dfc45 2072 if w > 0 and h > 0:
41f24c32 2073 ratio = w / h
313dfc45
LL
2074 for f in formats:
2075 if f.get('vcodec') != 'none':
2076 f['stretched_ratio'] = ratio
6271f1ca 2077
4bcc7bd1 2078 self._sort_formats(formats)
4ea3be0a 2079
d77ab8e2
S
2080 self.mark_watched(video_id, video_info)
2081
4ea3be0a 2082 return {
8bcc8756
JW
2083 'id': video_id,
2084 'uploader': video_uploader,
2085 'uploader_id': video_uploader_id,
fd050249 2086 'uploader_url': video_uploader_url,
dd4c4492
S
2087 'channel_id': channel_id,
2088 'channel_url': channel_url,
8bcc8756 2089 'upload_date': upload_date,
7caf9830 2090 'license': video_license,
936784b2 2091 'creator': video_creator or artist,
8bcc8756 2092 'title': video_title,
936784b2 2093 'alt_title': video_alt_title or track,
8bcc8756
JW
2094 'thumbnail': video_thumbnail,
2095 'description': video_description,
2096 'categories': video_categories,
000b6b5a 2097 'tags': video_tags,
8bcc8756 2098 'subtitles': video_subtitles,
360e1ca5 2099 'automatic_captions': automatic_captions,
8bcc8756
JW
2100 'duration': video_duration,
2101 'age_limit': 18 if age_gate else 0,
2102 'annotations': video_annotations,
9cafc3fd 2103 'chapters': chapters,
7e8c0af0 2104 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2105 'view_count': view_count,
4ea3be0a 2106 'like_count': like_count,
2107 'dislike_count': dislike_count,
2d30521a 2108 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
8bcc8756 2109 'formats': formats,
2fe1ff85 2110 'is_live': is_live,
7c80519c 2111 'start_time': start_time,
297a564b 2112 'end_time': end_time,
12afdc2a
S
2113 'series': series,
2114 'season_number': season_number,
2115 'episode_number': episode_number,
936784b2
S
2116 'track': track,
2117 'artist': artist,
4ea3be0a 2118 }
c5e8d7af 2119
5f6a1245 2120
8e7aad20 2121class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2122 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2123 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2124 (?:https?://)?
2125 (?:\w+\.)?
c5e8d7af 2126 (?:
feaa5ad7
S
2127 youtube\.com/
2128 (?:
87dadd45 2129 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2130 \? (?:.*?[&;])*? (?:p|a|list)=
2131 | p/
2132 )|
2133 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2134 )
d67cc9fa 2135 (
409b9324 2136 (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2137 # Top tracks, they can also include dots
d67cc9fa
JMF
2138 |(?:MC)[\w\.]*
2139 )
c5e8d7af
PH
2140 .*
2141 |
d0ba5587
S
2142 (%(playlist_id)s)
2143 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2144 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
648e6a1f 2145 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
78caa52a 2146 IE_NAME = 'youtube:playlist'
81127aa5
PH
2147 _TESTS = [{
2148 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2149 'info_dict': {
2150 'title': 'ytdl test PL',
a1cf99d0 2151 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
2152 },
2153 'playlist_count': 3,
9291475f
PH
2154 }, {
2155 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2156 'info_dict': {
acf757f4 2157 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
2158 'title': 'YDL_Empty_List',
2159 },
2160 'playlist_count': 0,
4201ba13 2161 'skip': 'This playlist is private',
9291475f
PH
2162 }, {
2163 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2164 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2165 'info_dict': {
2166 'title': '29C3: Not my department',
acf757f4 2167 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
2168 },
2169 'playlist_count': 95,
2170 }, {
2171 'note': 'issue #673',
2172 'url': 'PLBB231211A4F62143',
2173 'info_dict': {
f46a8702 2174 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2175 'id': 'PLBB231211A4F62143',
9291475f
PH
2176 },
2177 'playlist_mincount': 26,
2178 }, {
2179 'note': 'Large playlist',
2180 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2181 'info_dict': {
2182 'title': 'Uploads from Cauchemar',
acf757f4 2183 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
2184 },
2185 'playlist_mincount': 799,
2186 }, {
2187 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2188 'info_dict': {
2189 'title': 'YDL_safe_search',
acf757f4 2190 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2191 },
2192 'playlist_count': 2,
4201ba13 2193 'skip': 'This playlist is private',
ac7553d0
PH
2194 }, {
2195 'note': 'embedded',
2d3d2997 2196 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2197 'playlist_count': 4,
2198 'info_dict': {
2199 'title': 'JODA15',
acf757f4 2200 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 2201 }
87dadd45
S
2202 }, {
2203 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2204 'playlist_mincount': 485,
2205 'info_dict': {
2206 'title': '2017 華語最新單曲 (2/24更新)',
2207 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2208 }
6b08cdf6
PH
2209 }, {
2210 'note': 'Embedded SWF player',
2d3d2997 2211 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2212 'playlist_count': 4,
2213 'info_dict': {
2214 'title': 'JODA7',
acf757f4 2215 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 2216 }
4b7df0d3
JMF
2217 }, {
2218 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2219 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2220 'info_dict': {
acf757f4
PH
2221 'title': 'Uploads from Interstellar Movie',
2222 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2223 },
481cc733 2224 'playlist_mincount': 21,
dacb3a86
S
2225 }, {
2226 # Playlist URL that does not actually serve a playlist
2227 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2228 'info_dict': {
2229 'id': 'FqZTN594JQw',
2230 'ext': 'webm',
2231 'title': "Smiley's People 01 detective, Adventure Series, Action",
2232 'uploader': 'STREEM',
2233 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2234 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2235 'upload_date': '20150526',
2236 'license': 'Standard YouTube License',
2237 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2238 'categories': ['People & Blogs'],
2239 'tags': list,
2240 'like_count': int,
2241 'dislike_count': int,
2242 },
2243 'params': {
2244 'skip_download': True,
2245 },
2246 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2247 }, {
2248 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2249 'info_dict': {
2250 'id': 'yeWKywCrFtk',
2251 'ext': 'mp4',
2252 'title': 'Small Scale Baler and Braiding Rugs',
2253 'uploader': 'Backus-Page House Museum',
2254 'uploader_id': 'backuspagemuseum',
ec85ded8 2255 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733
S
2256 'upload_date': '20161008',
2257 'license': 'Standard YouTube License',
2258 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2259 'categories': ['Nonprofits & Activism'],
2260 'tags': list,
2261 'like_count': int,
2262 'dislike_count': int,
2263 },
2264 'params': {
2265 'noplaylist': True,
2266 'skip_download': True,
2267 },
feaa5ad7
S
2268 }, {
2269 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2270 'only_matching': True,
a6857510
S
2271 }, {
2272 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2273 'only_matching': True,
409b9324
S
2274 }, {
2275 # music album playlist
2276 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2277 'only_matching': True,
81127aa5 2278 }]
c5e8d7af 2279
880e1c52
JMF
def _real_initialize(self):
    """Run the login routine as this extractor's initialization step."""
    self._login()
2282
def _extract_mix(self, playlist_id):
    """Build a playlist result for a mix by walking its watch pages.

    The mixes are generated from a single video; the id of the playlist is
    just 'RD' + video_id, so the last 11 characters of *playlist_id* seed
    the first watch page.  Each following page is requested with the last
    video id collected so far, until a page contributes no new id.
    """
    # Verbose-mode pattern: whitespace inside it is not significant.
    id_pattern = r'''(?xs)data-video-username=".*?".*?
        href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    collected_ids = []
    seed_id = playlist_id[-11:]
    page_number = 1
    while True:
        page_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            page_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_number))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        fresh_ids = [
            video_id
            for video_id in orderedSet(re.findall(id_pattern, webpage))
            if video_id not in collected_ids]
        if not fresh_ids:
            break
        collected_ids.extend(fresh_ids)
        seed_id = collected_ids[-1]
        page_number += 1

    url_results = self._ids_to_results(collected_ids)

    # The title is read from the last downloaded page; the first CSS class
    # that yields a non-empty element wins.
    title_html = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_html = get_element_by_attribute('class', class_name, webpage)
        if title_html:
            break
    title = clean_html(title_html)

    return self.playlist_result(url_results, playlist_id, title)
2314
def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is False
    only when the page has no title and ``self._entries`` yields nothing.

    Raises ExtractorError (expected) when an alert on the page says the
    playlist does not exist, is private, or the parameters are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
    alerts = re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            hint = ', use --username or --netrc to access it' if 'private' in reason else ''
            raise ExtractorError(
                'This playlist %s' % reason + hint + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix locating the uploader link in the playlist header.
    uploader_base = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % uploader_base,
        page, 'uploader', default=None)
    uploader_match = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % uploader_base,
        page)
    uploader_id = uploader_url = None
    if uploader_match:
        uploader_id = uploader_match.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_match.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist['uploader'] = uploader
    playlist['uploader_id'] = uploader_id
    playlist['uploader_url'] = uploader_url

    return has_videos, playlist
c5e8d7af 2376
def _check_download_just_video(self, url, playlist_id):
    """Detect a video id in *url* and honour the ``noplaylist`` option.

    Returns a ``(video_id, result)`` tuple:
      * ``(None, None)`` when no video id is found in the URL;
      * ``(video_id, url_result)`` when ``noplaylist`` is set;
      * ``(video_id, None)`` otherwise.
    """
    # Check if it's a video-specific URL: first the ``v`` query parameter,
    # then a youtu.be / embed style path.
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
        return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

    self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
    return video_id, None
448830ce 2391
ebf1b291
S
def _real_extract(self, url):
    """Dispatch a playlist URL to single-video, mix or playlist extraction.

    Raises ExtractorError when *url* does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    if playlist_id.startswith(('RD', 'UL', 'PU')):
        # Mixes require a custom extraction process
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if video_id and not has_videos:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
    return playlist
448830ce 2416
c5e8d7af 2417
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` URLs.

    When the channel page exposes a ``UC...`` channel id, extraction is
    delegated to the matching ``UU...`` playlist; otherwise the channel's
    ``/videos`` listing is walked page by page.
    """
    IE_DESC = 'YouTube.com channels'
    IE_NAME = 'youtube:channel'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the ``/videos`` listing URL for *channel_id*."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel, preferring its uploads playlist when known."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        listing_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        channel_playlist_id = False
        if listing_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', listing_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to the app deep-link meta tags.
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    listing_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)

        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' channel prefix for 'UU' to get the playlist id.
            uploads_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for a first entry; with none, report any alert on the page.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
2507
2508
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for ``/user/<name>``, ``/c/<name>``, bare ``/<name>`` and
    ``ytuser:<name>`` inputs; extraction itself is inherited."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    IE_NAME = 'youtube:user'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other ``Youtube*IE`` in this module does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match.
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the ``/videos`` listing URL, defaulting the prefix to 'user'."""
        url_match = re.match(self._VALID_URL, url)
        prefix = url_match.group('user') or 'user'
        return self._TEMPLATE_URL % (prefix, url_match.group('id'))
2559
b05654f0 2560
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of a user or channel."""
    IE_DESC = 'YouTube.com live streams'
    IE_NAME = 'youtube:live'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a ``/live`` URL to its video, else to the base URL.

        The page download is non-fatal.  A video result is returned only
        when the page's og ``type`` starts with 'video' and its ``videoId``
        meta value is an 11-character id.
        """
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        base_url = url_match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        if (page_type.startswith('video') and video_id
                and re.match(r'^[0-9A-Za-z_-]{11}$', video_id)):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
2611
2612
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` page of a user or channel.

    Declares only matching metadata and tests; no method is overridden.
    """
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
0c148415
S
2641
2642
870f3bfc
S
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Base for the search extractors; overrides only the video regex."""
    # Captures an 11-character video id and, when present, the link title.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2645
2646
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Search extractor for the ``ytsearch`` keyword."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into the search request.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded until *n* videos are collected, a page
        yields no videos, or no "Next" link is found.  At most *n* entries
        are returned, wrapped in a playlist result titled with *query*.

        Raises ExtractorError (expected) when a page contains a
        search-message element.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop once the request is satisfied.  ">=" (not ">") avoids
            # downloading one more page when exactly n results are already
            # collected; that page's results would be sliced away below.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # Guarded so the slice is never attempted with n == float('inf').
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 2695
c9ae7b95 2696
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor for the ``ytsearchdate`` keyword."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Merged into the search request's query string by the parent class.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 2702
c9ae7b95 2703
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for ``/results?...`` search URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the results page and return its videos as a playlist
        titled with the decoded search query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
2724
2725
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's ``/playlists`` URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
2743
2744
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors.

    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Run the login routine as this extractor's initialization step."""
        self._login()

    def _entries(self, page):
        """Yield results for the feed's videos, following "load more" links.

        Iteration ends when a portion has no unseen video id or when no
        load-more link is found.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        content_html = page
        more_widget_html = page
        for page_num in itertools.count(1):
            found_ids = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(found_ids)
                if video_id not in seen_ids]
            if not new_ids:
                break
            seen_ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            more_link = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
2796
2797
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch-later list (playlist id 'WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video if one was selected, else the 'WL' playlist."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        # The has_videos flag of the returned tuple is not used here.
        return self._extract_playlist('WL')[1]
f459d170 2817
5f6a1245 2818
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the favourites page and the ``:ytfav`` shortcut."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Read the playlist id off the favourites page and hand it to the
        playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
2829
2830
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/recommended`` and the ``:ytrec`` shortcut."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed path segment and playlist title read by the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1ed5b5c9 2836
1ed5b5c9 2837
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/subscriptions`` and the ``:ytsubs`` shortcut."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed path segment and playlist title read by the base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 2843
1ed5b5c9 2844
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/history`` and the ``:ythistory`` shortcut."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    # Feed path segment and playlist title read by the base class.
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
2850
2851
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that carry no video id and always
    fails with a hint about quoting the URL."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely
        cause (an unquoted ``&`` in the shell)."""
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)
772fd5cc
PH
2899
2900
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v`` value is 1-10 characters long and
    always fails with an "incomplete id" error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url),
            expected=True)