]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[zattoo] Add support for more zattoo platform sites
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 clean_html,
30 error_to_compat_str,
31 ExtractorError,
32 float_or_none,
33 get_element_by_attribute,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_codecs,
39 parse_duration,
40 qualities,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 uppercase_escape,
50 urlencode_postdata,
51 )
52
53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # JSON RPC endpoints of the "GlifWebSignIn" flow (account lookup,
    # then password/TFA challenge)
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Recognized playlist id prefixes followed by at least 10 id characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        # Pin the UI language to English via the PREF cookie so that
        # scraped pages have a predictable layout and wording.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap bare video ids into url_result dicts handled by the
        # 'Youtube' extractor.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs carry session tokens that must be echoed back
        # on every subsequent sign-in request.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one sign-in RPC. The response body is prefixed with
            # non-JSON junk up to the first '[', which transform_source
            # strips before parsing (presumably an anti-JSON-hijacking
            # prefix).
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Opaque positional payload mirroring what the web sign-in form
        # sends; the structure is reverse-engineered, so do not reorder
        # or "clean up" the entries.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # The account hash identifies the user in all follow-up requests.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Password challenge; same opaque positional format as lookup_req.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry at [0][5] signals a login error.
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # A nested challenge entry means additional verification (TFA or
        # an interactive challenge) is required before cookies are issued.
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token required to address the TFA submission URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users sometimes paste SMS codes with their 'G-' prefix.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges we cannot solve non-interactively: tell the
                # user to resolve them in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Fetching the CheckCookie URL finalizes the session cookies.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # Successful logins redirect through myaccount.google.com.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Force the legacy (non-Polymer) page layout on every request, as
        # the extraction regexps target the old HTML.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Without a downloader there is nothing to configure or log into.
        if self._downloader is None:
            return
        self._set_language()
        # Login failure is deliberately non-fatal here; extraction of
        # public videos can still proceed.
        if not self._login():
            return
275
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Shared pagination logic for listings driven by a "Load more" button."""

    def _entries(self, page, playlist_id):
        """Yield entries from *page*, then keep following "Load more" AJAX
        pages until no button remains or an empty payload is returned."""
        widget_html = current_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            payload = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = payload['content_html']
            if not current_html.strip():
                # A "Load more" button may be present even when there are
                # no further videos; an empty payload means we are done.
                return
            widget_html = payload['load_more_widget_html']
299
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base class for pages whose entries are individual videos."""

    def _process_page(self, content):
        # Turn each scraped (id, title) pair into a url_result entry
        # dispatched to the 'Youtube' extractor.
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Scrape (video_id, title) pairs from *page* via self._VIDEO_RE.

        Duplicate ids are collapsed in first-seen order; a later match may
        supply a title that an earlier match lacked. Returns an iterable of
        (video_id, title) tuples.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # BUG FIX: the guard previously compared the 'id' group against
            # '0', which can never match an 11-character video id and so
            # never skipped the index-0 link; the intent is to inspect the
            # 'index' capture group.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            try:
                idx = ids_in_page.index(video_id)
                # Backfill a missing title for an id we already saw.
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
324
325
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Common extraction for pages that list whole playlists."""

    def _process_page(self, content):
        # Collect every playlist id linked from a lockup title, dropping
        # duplicates while preserving first-seen order.
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(found_ids):
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # Title is best-effort: a missing og:title must not abort extraction.
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
339
340
341 class YoutubeIE(YoutubeBaseInfoExtractor):
342 IE_DESC = 'YouTube.com'
343 _VALID_URL = r"""(?x)^
344 (
345 (?:https?://|//) # http(s):// or protocol-independent URL
346 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
347 (?:www\.)?deturl\.com/www\.youtube\.com/|
348 (?:www\.)?pwnyoutube\.com/|
349 (?:www\.)?hooktube\.com/|
350 (?:www\.)?yourepeat\.com/|
351 tube\.majestyc\.net/|
352 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
353 (?:.*?\#/)? # handle anchor (#/) redirect urls
354 (?: # the various things that can precede the ID:
355 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
356 |(?: # or the v= param in all its forms
357 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
358 (?:\?|\#!?) # the params delimiter ? or # or #!
359 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
360 v=
361 )
362 ))
363 |(?:
364 youtu\.be| # just youtu.be/xxxx
365 vid\.plus| # or vid.plus/xxxx
366 zwearz\.com/watch| # or zwearz.com/watch/xxxx
367 )/
368 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
369 )
370 )? # all until now is optional -> you can pass the naked ID
371 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
372 (?!.*?\blist=
373 (?:
374 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
375 WL # WL are handled by the watch later IE
376 )
377 )
378 (?(1).+)? # if we found the ID, everything can follow
379 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
380 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
381 _formats = {
382 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
383 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
384 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
385 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
386 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
387 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
388 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
389 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
390 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
391 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
392 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
393 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
394 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
395 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
396 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
397 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
398 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
399 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
400
401
402 # 3D videos
403 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
404 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
405 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
406 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
407 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
408 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
409 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
410
411 # Apple HTTP Live Streaming
412 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
413 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
414 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
415 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
416 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
417 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
418 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
419 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
420
421 # DASH mp4 video
422 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
426 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
428 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
430 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
431 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
432 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
433 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
434
435 # Dash mp4 audio
436 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
437 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
438 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
439 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
440 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
441 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
442 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
443
444 # Dash webm
445 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
450 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
451 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
452 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
458 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
459 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
461 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
462 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
464 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
465 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
466 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
467
468 # Dash webm audio
469 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
470 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
471
472 # Dash webm audio with opus inside
473 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
474 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
475 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
476
477 # RTMP (unnamed)
478 '_rtmp': {'protocol': 'rtmp'},
479 }
480 _SUBTITLE_FORMATS = ('ttml', 'vtt')
481
482 _GEO_BYPASS = False
483
484 IE_NAME = 'youtube'
485 _TESTS = [
486 {
487 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
488 'info_dict': {
489 'id': 'BaW_jenozKc',
490 'ext': 'mp4',
491 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
492 'uploader': 'Philipp Hagemeister',
493 'uploader_id': 'phihag',
494 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
495 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
496 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
497 'upload_date': '20121002',
498 'license': 'Standard YouTube License',
499 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
500 'categories': ['Science & Technology'],
501 'tags': ['youtube-dl'],
502 'duration': 10,
503 'like_count': int,
504 'dislike_count': int,
505 'start_time': 1,
506 'end_time': 9,
507 }
508 },
509 {
510 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
511 'note': 'Test generic use_cipher_signature video (#897)',
512 'info_dict': {
513 'id': 'UxxajLWwzqY',
514 'ext': 'mp4',
515 'upload_date': '20120506',
516 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
517 'alt_title': 'I Love It (feat. Charli XCX)',
518 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
519 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
520 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
521 'iconic ep', 'iconic', 'love', 'it'],
522 'duration': 180,
523 'uploader': 'Icona Pop',
524 'uploader_id': 'IconaPop',
525 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
526 'license': 'Standard YouTube License',
527 'creator': 'Icona Pop',
528 'track': 'I Love It (feat. Charli XCX)',
529 'artist': 'Icona Pop',
530 }
531 },
532 {
533 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
534 'note': 'Test VEVO video with age protection (#956)',
535 'info_dict': {
536 'id': '07FYdnEawAQ',
537 'ext': 'mp4',
538 'upload_date': '20130703',
539 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
540 'alt_title': 'Tunnel Vision',
541 'description': 'md5:64249768eec3bc4276236606ea996373',
542 'duration': 419,
543 'uploader': 'justintimberlakeVEVO',
544 'uploader_id': 'justintimberlakeVEVO',
545 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
546 'license': 'Standard YouTube License',
547 'creator': 'Justin Timberlake',
548 'track': 'Tunnel Vision',
549 'artist': 'Justin Timberlake',
550 'age_limit': 18,
551 }
552 },
553 {
554 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
555 'note': 'Embed-only video (#1746)',
556 'info_dict': {
557 'id': 'yZIXLfi8CZQ',
558 'ext': 'mp4',
559 'upload_date': '20120608',
560 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
561 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
562 'uploader': 'SET India',
563 'uploader_id': 'setindia',
564 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
565 'license': 'Standard YouTube License',
566 'age_limit': 18,
567 }
568 },
569 {
570 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
571 'note': 'Use the first video ID in the URL',
572 'info_dict': {
573 'id': 'BaW_jenozKc',
574 'ext': 'mp4',
575 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
576 'uploader': 'Philipp Hagemeister',
577 'uploader_id': 'phihag',
578 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
579 'upload_date': '20121002',
580 'license': 'Standard YouTube License',
581 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
582 'categories': ['Science & Technology'],
583 'tags': ['youtube-dl'],
584 'duration': 10,
585 'like_count': int,
586 'dislike_count': int,
587 },
588 'params': {
589 'skip_download': True,
590 },
591 },
592 {
593 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
594 'note': '256k DASH audio (format 141) via DASH manifest',
595 'info_dict': {
596 'id': 'a9LDPn-MO4I',
597 'ext': 'm4a',
598 'upload_date': '20121002',
599 'uploader_id': '8KVIDEO',
600 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
601 'description': '',
602 'uploader': '8KVIDEO',
603 'license': 'Standard YouTube License',
604 'title': 'UHDTV TEST 8K VIDEO.mp4'
605 },
606 'params': {
607 'youtube_include_dash_manifest': True,
608 'format': '141',
609 },
610 'skip': 'format 141 not served anymore',
611 },
612 # DASH manifest with encrypted signature
613 {
614 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
615 'info_dict': {
616 'id': 'IB3lcPjvWLA',
617 'ext': 'm4a',
618 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
619 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
620 'duration': 244,
621 'uploader': 'AfrojackVEVO',
622 'uploader_id': 'AfrojackVEVO',
623 'upload_date': '20131011',
624 'license': 'Standard YouTube License',
625 },
626 'params': {
627 'youtube_include_dash_manifest': True,
628 'format': '141/bestaudio[ext=m4a]',
629 },
630 },
631 # JS player signature function name containing $
632 {
633 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
634 'info_dict': {
635 'id': 'nfWlot6h_JM',
636 'ext': 'm4a',
637 'title': 'Taylor Swift - Shake It Off',
638 'alt_title': 'Shake It Off',
639 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
640 'duration': 242,
641 'uploader': 'TaylorSwiftVEVO',
642 'uploader_id': 'TaylorSwiftVEVO',
643 'upload_date': '20140818',
644 'license': 'Standard YouTube License',
645 'creator': 'Taylor Swift',
646 },
647 'params': {
648 'youtube_include_dash_manifest': True,
649 'format': '141/bestaudio[ext=m4a]',
650 },
651 },
652 # Controversy video
653 {
654 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
655 'info_dict': {
656 'id': 'T4XJQO3qol8',
657 'ext': 'mp4',
658 'duration': 219,
659 'upload_date': '20100909',
660 'uploader': 'TJ Kirk',
661 'uploader_id': 'TheAmazingAtheist',
662 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
663 'license': 'Standard YouTube License',
664 'title': 'Burning Everyone\'s Koran',
665 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
666 }
667 },
668 # Normal age-gate video (No vevo, embed allowed)
669 {
670 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
671 'info_dict': {
672 'id': 'HtVdAasjOgU',
673 'ext': 'mp4',
674 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
675 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
676 'duration': 142,
677 'uploader': 'The Witcher',
678 'uploader_id': 'WitcherGame',
679 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
680 'upload_date': '20140605',
681 'license': 'Standard YouTube License',
682 'age_limit': 18,
683 },
684 },
685 # Age-gate video with encrypted signature
686 {
687 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
688 'info_dict': {
689 'id': '6kLq3WMV1nU',
690 'ext': 'webm',
691 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
692 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
693 'duration': 246,
694 'uploader': 'LloydVEVO',
695 'uploader_id': 'LloydVEVO',
696 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
697 'upload_date': '20110629',
698 'license': 'Standard YouTube License',
699 'age_limit': 18,
700 },
701 },
702 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
703 # YouTube Red ad is not captured for creator
704 {
705 'url': '__2ABJjxzNo',
706 'info_dict': {
707 'id': '__2ABJjxzNo',
708 'ext': 'mp4',
709 'duration': 266,
710 'upload_date': '20100430',
711 'uploader_id': 'deadmau5',
712 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
713 'creator': 'deadmau5',
714 'description': 'md5:12c56784b8032162bb936a5f76d55360',
715 'uploader': 'deadmau5',
716 'license': 'Standard YouTube License',
717 'title': 'Deadmau5 - Some Chords (HD)',
718 'alt_title': 'Some Chords',
719 },
720 'expected_warnings': [
721 'DASH manifest missing',
722 ]
723 },
724 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
725 {
726 'url': 'lqQg6PlCWgI',
727 'info_dict': {
728 'id': 'lqQg6PlCWgI',
729 'ext': 'mp4',
730 'duration': 6085,
731 'upload_date': '20150827',
732 'uploader_id': 'olympic',
733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
734 'license': 'Standard YouTube License',
735 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
736 'uploader': 'Olympic',
737 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
738 },
739 'params': {
740 'skip_download': 'requires avconv',
741 }
742 },
743 # Non-square pixels
744 {
745 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
746 'info_dict': {
747 'id': '_b-2C3KPAM0',
748 'ext': 'mp4',
749 'stretched_ratio': 16 / 9.,
750 'duration': 85,
751 'upload_date': '20110310',
752 'uploader_id': 'AllenMeow',
753 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
754 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
755 'uploader': '孫ᄋᄅ',
756 'license': 'Standard YouTube License',
757 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
758 },
759 },
760 # url_encoded_fmt_stream_map is empty string
761 {
762 'url': 'qEJwOuvDf7I',
763 'info_dict': {
764 'id': 'qEJwOuvDf7I',
765 'ext': 'webm',
766 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
767 'description': '',
768 'upload_date': '20150404',
769 'uploader_id': 'spbelect',
770 'uploader': 'Наблюдатели Петербурга',
771 },
772 'params': {
773 'skip_download': 'requires avconv',
774 },
775 'skip': 'This live event has ended.',
776 },
777 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
778 {
779 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
780 'info_dict': {
781 'id': 'FIl7x6_3R5Y',
782 'ext': 'webm',
783 'title': 'md5:7b81415841e02ecd4313668cde88737a',
784 'description': 'md5:116377fd2963b81ec4ce64b542173306',
785 'duration': 220,
786 'upload_date': '20150625',
787 'uploader_id': 'dorappi2000',
788 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
789 'uploader': 'dorappi2000',
790 'license': 'Standard YouTube License',
791 'formats': 'mincount:31',
792 },
793 'skip': 'not actual anymore',
794 },
795 # DASH manifest with segment_list
796 {
797 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
798 'md5': '8ce563a1d667b599d21064e982ab9e31',
799 'info_dict': {
800 'id': 'CsmdDsKjzN8',
801 'ext': 'mp4',
802 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
803 'uploader': 'Airtek',
804 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
805 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
806 'license': 'Standard YouTube License',
807 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
808 },
809 'params': {
810 'youtube_include_dash_manifest': True,
811 'format': '135', # bestvideo
812 },
813 'skip': 'This live event has ended.',
814 },
815 {
816 # Multifeed videos (multiple cameras), URL is for Main Camera
817 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
818 'info_dict': {
819 'id': 'jqWvoWXjCVs',
820 'title': 'teamPGP: Rocket League Noob Stream',
821 'description': 'md5:dc7872fb300e143831327f1bae3af010',
822 },
823 'playlist': [{
824 'info_dict': {
825 'id': 'jqWvoWXjCVs',
826 'ext': 'mp4',
827 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
828 'description': 'md5:dc7872fb300e143831327f1bae3af010',
829 'duration': 7335,
830 'upload_date': '20150721',
831 'uploader': 'Beer Games Beer',
832 'uploader_id': 'beergamesbeer',
833 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
834 'license': 'Standard YouTube License',
835 },
836 }, {
837 'info_dict': {
838 'id': '6h8e8xoXJzg',
839 'ext': 'mp4',
840 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
841 'description': 'md5:dc7872fb300e143831327f1bae3af010',
842 'duration': 7337,
843 'upload_date': '20150721',
844 'uploader': 'Beer Games Beer',
845 'uploader_id': 'beergamesbeer',
846 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
847 'license': 'Standard YouTube License',
848 },
849 }, {
850 'info_dict': {
851 'id': 'PUOgX5z9xZw',
852 'ext': 'mp4',
853 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
854 'description': 'md5:dc7872fb300e143831327f1bae3af010',
855 'duration': 7337,
856 'upload_date': '20150721',
857 'uploader': 'Beer Games Beer',
858 'uploader_id': 'beergamesbeer',
859 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
860 'license': 'Standard YouTube License',
861 },
862 }, {
863 'info_dict': {
864 'id': 'teuwxikvS5k',
865 'ext': 'mp4',
866 'title': 'teamPGP: Rocket League Noob Stream (zim)',
867 'description': 'md5:dc7872fb300e143831327f1bae3af010',
868 'duration': 7334,
869 'upload_date': '20150721',
870 'uploader': 'Beer Games Beer',
871 'uploader_id': 'beergamesbeer',
872 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
873 'license': 'Standard YouTube License',
874 },
875 }],
876 'params': {
877 'skip_download': True,
878 },
879 },
880 {
881 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
882 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
883 'info_dict': {
884 'id': 'gVfLd0zydlo',
885 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
886 },
887 'playlist_count': 2,
888 'skip': 'Not multifeed anymore',
889 },
890 {
891 'url': 'https://vid.plus/FlRa-iH7PGw',
892 'only_matching': True,
893 },
894 {
895 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
896 'only_matching': True,
897 },
898 {
899 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
900 # Also tests cut-off URL expansion in video description (see
901 # https://github.com/rg3/youtube-dl/issues/1892,
902 # https://github.com/rg3/youtube-dl/issues/8164)
903 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
904 'info_dict': {
905 'id': 'lsguqyKfVQg',
906 'ext': 'mp4',
907 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
908 'alt_title': 'Dark Walk - Position Music',
909 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
910 'duration': 133,
911 'upload_date': '20151119',
912 'uploader_id': 'IronSoulElf',
913 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
914 'uploader': 'IronSoulElf',
915 'license': 'Standard YouTube License',
916 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
917 'track': 'Dark Walk - Position Music',
918 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
919 },
920 'params': {
921 'skip_download': True,
922 },
923 },
924 {
925 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
926 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
927 'only_matching': True,
928 },
929 {
930 # Video with yt:stretch=17:0
931 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
932 'info_dict': {
933 'id': 'Q39EVAstoRM',
934 'ext': 'mp4',
935 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
936 'description': 'md5:ee18a25c350637c8faff806845bddee9',
937 'upload_date': '20151107',
938 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
939 'uploader': 'CH GAMER DROID',
940 },
941 'params': {
942 'skip_download': True,
943 },
944 'skip': 'This video does not exist.',
945 },
946 {
947 # Video licensed under Creative Commons
948 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
949 'info_dict': {
950 'id': 'M4gD1WSo5mA',
951 'ext': 'mp4',
952 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
953 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
954 'duration': 721,
955 'upload_date': '20150127',
956 'uploader_id': 'BerkmanCenter',
957 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
958 'uploader': 'The Berkman Klein Center for Internet & Society',
959 'license': 'Creative Commons Attribution license (reuse allowed)',
960 },
961 'params': {
962 'skip_download': True,
963 },
964 },
965 {
966 # Channel-like uploader_url
967 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
968 'info_dict': {
969 'id': 'eQcmzGIKrzg',
970 'ext': 'mp4',
971 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
972 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
973 'duration': 4060,
974 'upload_date': '20151119',
975 'uploader': 'Bernie Sanders',
976 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
977 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
978 'license': 'Creative Commons Attribution license (reuse allowed)',
979 },
980 'params': {
981 'skip_download': True,
982 },
983 },
984 {
985 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
986 'only_matching': True,
987 },
988 {
989 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
990 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
991 'only_matching': True,
992 },
993 {
994 # Rental video preview
995 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
996 'info_dict': {
997 'id': 'uGpuVWrhIzE',
998 'ext': 'mp4',
999 'title': 'Piku - Trailer',
1000 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1001 'upload_date': '20150811',
1002 'uploader': 'FlixMatrix',
1003 'uploader_id': 'FlixMatrixKaravan',
1004 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1005 'license': 'Standard YouTube License',
1006 },
1007 'params': {
1008 'skip_download': True,
1009 },
1010 'skip': 'This video is not available.',
1011 },
1012 {
1013 # YouTube Red video with episode data
1014 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1015 'info_dict': {
1016 'id': 'iqKdEhx-dD4',
1017 'ext': 'mp4',
1018 'title': 'Isolation - Mind Field (Ep 1)',
1019 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
1020 'duration': 2085,
1021 'upload_date': '20170118',
1022 'uploader': 'Vsauce',
1023 'uploader_id': 'Vsauce',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1025 'license': 'Standard YouTube License',
1026 'series': 'Mind Field',
1027 'season_number': 1,
1028 'episode_number': 1,
1029 },
1030 'params': {
1031 'skip_download': True,
1032 },
1033 'expected_warnings': [
1034 'Skipping DASH manifest',
1035 ],
1036 },
1037 {
1038 # The following content has been identified by the YouTube community
1039 # as inappropriate or offensive to some audiences.
1040 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1041 'info_dict': {
1042 'id': '6SJNVb0GnPI',
1043 'ext': 'mp4',
1044 'title': 'Race Differences in Intelligence',
1045 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1046 'duration': 965,
1047 'upload_date': '20140124',
1048 'uploader': 'New Century Foundation',
1049 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1050 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1051 'license': 'Standard YouTube License',
1052 },
1053 'params': {
1054 'skip_download': True,
1055 },
1056 },
1057 {
1058 # itag 212
1059 'url': '1t24XAntNCY',
1060 'only_matching': True,
1061 },
1062 {
1063 # geo restricted to JP
1064 'url': 'sJL6WA-aGkQ',
1065 'only_matching': True,
1066 },
1067 {
1068 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1069 'only_matching': True,
1070 },
1071 ]
1072
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Memoizes signature-deciphering functions keyed by
        # (player_url, signature layout) — see _decrypt_signature — so each
        # player JS/SWF is downloaded and parsed at most once per instance.
        self._player_cache = {}
1076
1077 def report_video_info_webpage_download(self, video_id):
1078 """Report attempt to download video info webpage."""
1079 self.to_screen('%s: Downloading video info webpage' % video_id)
1080
1081 def report_information_extraction(self, video_id):
1082 """Report attempt to extract video information."""
1083 self.to_screen('%s: Extracting video information' % video_id)
1084
1085 def report_unavailable_format(self, video_id, format):
1086 """Report extracted video URL."""
1087 self.to_screen('%s: Format %s not available' % (video_id, format))
1088
1089 def report_rtmp_download(self):
1090 """Indicate the download will use the RTMP protocol."""
1091 self.to_screen('RTMP download detected')
1092
1093 def _signature_cache_id(self, example_sig):
1094 """ Return a string representation of a signature """
1095 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1096
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers an encrypted signature string.

        The player's build id and type (js/swf) are parsed from player_url so
        the result can be cached per player build on the filesystem.
        Raises ExtractorError when the player URL cannot be identified.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache.  The key also encodes the signature
        # layout (lengths of the dot-separated parts) since different layouts
        # may be deciphered differently by the same player.
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # Cache key must be a safe filename (no path separators).
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # A cached spec is a list of source indices: output char j comes
            # from input position cache_spec[j].
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Probe the freshly extracted function with a string of distinct
        # characters to learn which input position ends up at each output
        # position; that permutation is what gets cached above.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1142
    def _print_sig_code(self, func, example_sig):
        """Debug helper: probe *func* and print equivalent Python source.

        Probes the deciphering function with a string of distinct characters
        and prints a chain of slices/index expressions that reproduces it
        (used with --youtube-print-sig-code).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with constant step as one slice.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, compressing +1/-1 runs into slices
            # and emitting single-index lookups otherwise.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or flush the finished slice.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new ascending/descending run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the run still in progress.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1181
1182 def _parse_sig_js(self, jscode):
1183 funcname = self._search_regex(
1184 (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1185 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1186 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1187 r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1188 jscode, 'Initial JS player signature function name', group='sig')
1189
1190 jsi = JSInterpreter(jscode)
1191 initial_function = jsi.extract_function(funcname)
1192 return lambda s: initial_function([s])
1193
1194 def _parse_sig_swf(self, file_contents):
1195 swfi = SWFInterpreter(file_contents)
1196 TARGET_CLASSNAME = 'SignatureDecipher'
1197 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1198 initial_function = swfi.extract_function(searched_class, 'decipher')
1199 return lambda s: initial_function([s])
1200
1201 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1202 """Turn the encrypted s field into a working signature"""
1203
1204 if player_url is None:
1205 raise ExtractorError('Cannot decrypt signature without player_url')
1206
1207 if player_url.startswith('//'):
1208 player_url = 'https:' + player_url
1209 elif not re.match(r'https?://', player_url):
1210 player_url = compat_urlparse.urljoin(
1211 'https://www.youtube.com', player_url)
1212 try:
1213 player_id = (player_url, self._signature_cache_id(s))
1214 if player_id not in self._player_cache:
1215 func = self._extract_signature_function(
1216 video_id, player_url, s
1217 )
1218 self._player_cache[player_id] = func
1219 func = self._player_cache[player_id]
1220 if self._downloader.params.get('youtube_print_sig_code'):
1221 self._print_sig_code(func, s)
1222 return func(s)
1223 except Exception as e:
1224 tb = traceback.format_exc()
1225 raise ExtractorError(
1226 'Signature extraction failed: ' + tb, cause=e)
1227
1228 def _get_subtitles(self, video_id, webpage):
1229 try:
1230 subs_doc = self._download_xml(
1231 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1232 video_id, note=False)
1233 except ExtractorError as err:
1234 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1235 return {}
1236
1237 sub_lang_list = {}
1238 for track in subs_doc.findall('track'):
1239 lang = track.attrib['lang_code']
1240 if lang in sub_lang_list:
1241 continue
1242 sub_formats = []
1243 for ext in self._SUBTITLE_FORMATS:
1244 params = compat_urllib_parse_urlencode({
1245 'lang': lang,
1246 'v': video_id,
1247 'fmt': ext,
1248 'name': track.attrib['name'].encode('utf-8'),
1249 })
1250 sub_formats.append({
1251 'url': 'https://www.youtube.com/api/timedtext?' + params,
1252 'ext': ext,
1253 })
1254 sub_lang_list[lang] = sub_formats
1255 if not sub_lang_list:
1256 self._downloader.report_warning('video doesn\'t have subtitles')
1257 return {}
1258 return sub_lang_list
1259
1260 def _get_ytplayer_config(self, video_id, webpage):
1261 patterns = (
1262 # User data may contain arbitrary character sequences that may affect
1263 # JSON extraction with regex, e.g. when '};' is contained the second
1264 # regex won't capture the whole JSON. Yet working around by trying more
1265 # concrete regex first keeping in mind proper quoted string handling
1266 # to be implemented in future that will replace this workaround (see
1267 # https://github.com/rg3/youtube-dl/issues/7468,
1268 # https://github.com/rg3/youtube-dl/pull/7599)
1269 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1270 r';ytplayer\.config\s*=\s*({.+?});',
1271 )
1272 config = self._search_regex(
1273 patterns, webpage, 'ytplayer.config', default=None)
1274 if config:
1275 return self._parse_json(
1276 uppercase_escape(config), video_id, fatal=False)
1277
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> list of caption formats, or
        {} (after a warning) when no automatic captions can be found.  Three
        historical caption mechanisms are tried in order: ttsurl listing,
        player_response captionTracks, and caption_tracks query strings.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy flow: ttsurl + timestamp; fetch the track list first.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build {lang: [formats]} by rewriting the tlang/fmt query
                # parameters of the base caption URL for every language/ext.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1379
1380 def _mark_watched(self, video_id, video_info):
1381 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1382 if not playback_url:
1383 return
1384 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1385 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1386
1387 # cpn generation algorithm is reverse engineered from base.js.
1388 # In fact it works even with dummy cpn.
1389 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1390 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1391
1392 qs.update({
1393 'ver': ['2'],
1394 'cpn': [cpn],
1395 })
1396 playback_url = compat_urlparse.urlunparse(
1397 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1398
1399 self._download_webpage(
1400 playback_url, video_id, 'Marking watched',
1401 'Unable to mark watched', fatal=False)
1402
1403 @staticmethod
1404 def _extract_urls(webpage):
1405 # Embedded YouTube player
1406 entries = [
1407 unescapeHTML(mobj.group('url'))
1408 for mobj in re.finditer(r'''(?x)
1409 (?:
1410 <iframe[^>]+?src=|
1411 data-video-url=|
1412 <embed[^>]+?src=|
1413 embedSWF\(?:\s*|
1414 <object[^>]+data=|
1415 new\s+SWFObject\(
1416 )
1417 (["\'])
1418 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1419 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1420 \1''', webpage)]
1421
1422 # lazyYT YouTube embed
1423 entries.extend(list(map(
1424 unescapeHTML,
1425 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1426
1427 # Wordpress "YouTube Video Importer" plugin
1428 matches = re.findall(r'''(?x)<div[^>]+
1429 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1430 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1431 entries.extend(m[-1] for m in matches)
1432
1433 return entries
1434
1435 @staticmethod
1436 def _extract_url(webpage):
1437 urls = YoutubeIE._extract_urls(webpage)
1438 return urls[0] if urls else None
1439
1440 @classmethod
1441 def extract_id(cls, url):
1442 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1443 if mobj is None:
1444 raise ExtractorError('Invalid URL: %s' % url)
1445 video_id = mobj.group(2)
1446 return video_id
1447
1448 def _extract_annotations(self, video_id):
1449 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1450 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1451
    @staticmethod
    def _extract_chapters(description, duration):
        """Parse chapter markers from the video description HTML.

        Chapter lines are description lines containing a player seekTo link
        whose text is an mm:ss or hh:mm:ss timestamp.  Returns a list of
        {'start_time', 'end_time', 'title'} dicts, or None when the
        description is empty or contains no chapter lines.
        """
        if not description:
            return None
        chapter_lines = re.findall(
            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
            description)
        if not chapter_lines:
            return None
        chapters = []
        # next_num starts at 1, so it is simultaneously the 1-based position
        # of the current chapter and the 0-based index of the next one.
        for next_num, (chapter_line, time_point) in enumerate(
                chapter_lines, start=1):
            start_time = parse_duration(time_point)
            if start_time is None:
                continue
            if start_time > duration:
                break
            # A chapter ends where the next one starts; the last chapter ends
            # at the video duration.
            end_time = (duration if next_num == len(chapter_lines)
                        else parse_duration(chapter_lines[next_num][1]))
            if end_time is None:
                continue
            if end_time > duration:
                end_time = duration
            if start_time > end_time:
                break
            # Strip the seekTo anchor and collapse whitespace to get the title.
            chapter_title = re.sub(
                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
            chapter_title = re.sub(r'\s+', ' ', chapter_title)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': chapter_title,
            })
        return chapters
1486
1487 def _real_extract(self, url):
1488 url, smuggled_data = unsmuggle_url(url, {})
1489
1490 proto = (
1491 'http' if self._downloader.params.get('prefer_insecure', False)
1492 else 'https')
1493
1494 start_time = None
1495 end_time = None
1496 parsed_url = compat_urllib_parse_urlparse(url)
1497 for component in [parsed_url.fragment, parsed_url.query]:
1498 query = compat_parse_qs(component)
1499 if start_time is None and 't' in query:
1500 start_time = parse_duration(query['t'][0])
1501 if start_time is None and 'start' in query:
1502 start_time = parse_duration(query['start'][0])
1503 if end_time is None and 'end' in query:
1504 end_time = parse_duration(query['end'][0])
1505
1506 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1507 mobj = re.search(self._NEXT_URL_RE, url)
1508 if mobj:
1509 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1510 video_id = self.extract_id(url)
1511
1512 # Get video webpage
1513 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1514 video_webpage = self._download_webpage(url, video_id)
1515
1516 # Attempt to extract SWF player URL
1517 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1518 if mobj is not None:
1519 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1520 else:
1521 player_url = None
1522
1523 dash_mpds = []
1524
1525 def add_dash_mpd(video_info):
1526 dash_mpd = video_info.get('dashmpd')
1527 if dash_mpd and dash_mpd[0] not in dash_mpds:
1528 dash_mpds.append(dash_mpd[0])
1529
1530 is_live = None
1531 view_count = None
1532
1533 def extract_view_count(v_info):
1534 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1535
1536 # Get video info
1537 embed_webpage = None
1538 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1539 age_gate = True
1540 # We simulate the access to the video from www.youtube.com/v/{video_id}
1541 # this can be viewed without login into Youtube
1542 url = proto + '://www.youtube.com/embed/%s' % video_id
1543 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1544 data = compat_urllib_parse_urlencode({
1545 'video_id': video_id,
1546 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1547 'sts': self._search_regex(
1548 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1549 })
1550 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1551 video_info_webpage = self._download_webpage(
1552 video_info_url, video_id,
1553 note='Refetching age-gated info webpage',
1554 errnote='unable to download video info webpage')
1555 video_info = compat_parse_qs(video_info_webpage)
1556 add_dash_mpd(video_info)
1557 else:
1558 age_gate = False
1559 video_info = None
1560 sts = None
1561 # Try looking directly into the video webpage
1562 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1563 if ytplayer_config:
1564 args = ytplayer_config['args']
1565 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1566 # Convert to the same format returned by compat_parse_qs
1567 video_info = dict((k, [v]) for k, v in args.items())
1568 add_dash_mpd(video_info)
1569 # Rental video is not rented but preview is available (e.g.
1570 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1571 # https://github.com/rg3/youtube-dl/issues/10532)
1572 if not video_info and args.get('ypc_vid'):
1573 return self.url_result(
1574 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1575 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1576 is_live = True
1577 sts = ytplayer_config.get('sts')
1578 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1579 # We also try looking in get_video_info since it may contain different dashmpd
1580 # URL that points to a DASH manifest with possibly different itag set (some itags
1581 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1582 # manifest pointed by get_video_info's dashmpd).
1583 # The general idea is to take a union of itags of both DASH manifests (for example
1584 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1585 self.report_video_info_webpage_download(video_id)
1586 for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
1587 query = {
1588 'video_id': video_id,
1589 'ps': 'default',
1590 'eurl': '',
1591 'gl': 'US',
1592 'hl': 'en',
1593 }
1594 if el:
1595 query['el'] = el
1596 if sts:
1597 query['sts'] = sts
1598 video_info_webpage = self._download_webpage(
1599 '%s://www.youtube.com/get_video_info' % proto,
1600 video_id, note=False,
1601 errnote='unable to download video info webpage',
1602 fatal=False, query=query)
1603 if not video_info_webpage:
1604 continue
1605 get_video_info = compat_parse_qs(video_info_webpage)
1606 add_dash_mpd(get_video_info)
1607 if view_count is None:
1608 view_count = extract_view_count(get_video_info)
1609 if not video_info:
1610 video_info = get_video_info
1611 if 'token' in get_video_info:
1612 # Different get_video_info requests may report different results, e.g.
1613 # some may report video unavailability, but some may serve it without
1614 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1615 # the original webpage as well as el=info and el=embedded get_video_info
1616 # requests report video unavailability due to geo restriction while
1617 # el=detailpage succeeds and returns valid data). This is probably
1618 # due to YouTube measures against IP ranges of hosting providers.
1619 # Working around by preferring the first succeeded video_info containing
1620 # the token if no such video_info yet was found.
1621 if 'token' not in video_info:
1622 video_info = get_video_info
1623 break
1624
1625 def extract_unavailable_message():
1626 return self._html_search_regex(
1627 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1628 video_webpage, 'unavailable message', default=None)
1629
1630 if 'token' not in video_info:
1631 if 'reason' in video_info:
1632 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1633 regions_allowed = self._html_search_meta(
1634 'regionsAllowed', video_webpage, default=None)
1635 countries = regions_allowed.split(',') if regions_allowed else None
1636 self.raise_geo_restricted(
1637 msg=video_info['reason'][0], countries=countries)
1638 reason = video_info['reason'][0]
1639 if 'Invalid parameters' in reason:
1640 unavailable_message = extract_unavailable_message()
1641 if unavailable_message:
1642 reason = unavailable_message
1643 raise ExtractorError(
1644 'YouTube said: %s' % reason,
1645 expected=True, video_id=video_id)
1646 else:
1647 raise ExtractorError(
1648 '"token" parameter not in video info for unknown reason',
1649 video_id=video_id)
1650
1651 # title
1652 if 'title' in video_info:
1653 video_title = video_info['title'][0]
1654 else:
1655 self._downloader.report_warning('Unable to extract video title')
1656 video_title = '_'
1657
1658 # description
1659 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1660 if video_description:
1661
            def replace_url(m):
                """re.sub callback: resolve an anchor href found in the video
                description and, when it is a YouTube /redirect link, unwrap
                it to its real destination (the `q` query parameter)."""
                redir_url = compat_urlparse.urljoin(url, m.group(1))
                parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
                if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                    qs = compat_parse_qs(parsed_redir_url.query)
                    q = qs.get('q')
                    if q and q[0]:
                        # The actual target URL is carried in ?q=
                        return q[0]
                # Not a redirect link (or no usable ?q=): keep the resolved URL
                return redir_url
1671
1672 description_original = video_description = re.sub(r'''(?x)
1673 <a\s+
1674 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1675 (?:title|href)="([^"]+)"\s+
1676 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1677 class="[^"]*"[^>]*>
1678 [^<]+\.{3}\s*
1679 </a>
1680 ''', replace_url, video_description)
1681 video_description = clean_html(video_description)
1682 else:
1683 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1684 if fd_mobj:
1685 video_description = unescapeHTML(fd_mobj.group(1))
1686 else:
1687 video_description = ''
1688
1689 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1690 if not self._downloader.params.get('noplaylist'):
1691 entries = []
1692 feed_ids = []
1693 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
1694 for feed in multifeed_metadata_list.split(','):
1695 # Unquote should take place before split on comma (,) since textual
1696 # fields may contain comma as well (see
1697 # https://github.com/rg3/youtube-dl/issues/8536)
1698 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1699 entries.append({
1700 '_type': 'url_transparent',
1701 'ie_key': 'Youtube',
1702 'url': smuggle_url(
1703 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1704 {'force_singlefeed': True}),
1705 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1706 })
1707 feed_ids.append(feed_data['id'][0])
1708 self.to_screen(
1709 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1710 % (', '.join(feed_ids), video_id))
1711 return self.playlist_result(entries, video_id, video_title, video_description)
1712 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1713
1714 if view_count is None:
1715 view_count = extract_view_count(video_info)
1716
1717 # Check for "rental" videos
1718 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1719 raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
1720
        def _extract_filesize(media_url):
            """Pull the content length in bytes out of a media URL's `clen`
            parameter (matches both `clen=N` query and `/clen/N` path
            styles); returns None when absent."""
            return int_or_none(self._search_regex(
                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1724
1725 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1726 self.report_rtmp_download()
1727 formats = [{
1728 'format_id': '_rtmp',
1729 'protocol': 'rtmp',
1730 'url': video_info['conn'][0],
1731 'player_url': player_url,
1732 }]
1733 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1734 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1735 if 'rtmpe%3Dyes' in encoded_url_map:
1736 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1737 formats_spec = {}
1738 fmt_list = video_info.get('fmt_list', [''])[0]
1739 if fmt_list:
1740 for fmt in fmt_list.split(','):
1741 spec = fmt.split('/')
1742 if len(spec) > 1:
1743 width_height = spec[1].split('x')
1744 if len(width_height) == 2:
1745 formats_spec[spec[0]] = {
1746 'resolution': spec[1],
1747 'width': int_or_none(width_height[0]),
1748 'height': int_or_none(width_height[1]),
1749 }
1750 q = qualities(['small', 'medium', 'hd720'])
1751 formats = []
1752 for url_data_str in encoded_url_map.split(','):
1753 url_data = compat_parse_qs(url_data_str)
1754 if 'itag' not in url_data or 'url' not in url_data:
1755 continue
1756 format_id = url_data['itag'][0]
1757 url = url_data['url'][0]
1758
1759 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1760 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1761 jsplayer_url_json = self._search_regex(
1762 ASSETS_RE,
1763 embed_webpage if age_gate else video_webpage,
1764 'JS player URL (1)', default=None)
1765 if not jsplayer_url_json and not age_gate:
1766 # We need the embed website after all
1767 if embed_webpage is None:
1768 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1769 embed_webpage = self._download_webpage(
1770 embed_url, video_id, 'Downloading embed webpage')
1771 jsplayer_url_json = self._search_regex(
1772 ASSETS_RE, embed_webpage, 'JS player URL')
1773
1774 player_url = json.loads(jsplayer_url_json)
1775 if player_url is None:
1776 player_url_json = self._search_regex(
1777 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1778 video_webpage, 'age gate player URL')
1779 player_url = json.loads(player_url_json)
1780
1781 if 'sig' in url_data:
1782 url += '&signature=' + url_data['sig'][0]
1783 elif 's' in url_data:
1784 encrypted_sig = url_data['s'][0]
1785
1786 if self._downloader.params.get('verbose'):
1787 if player_url is None:
1788 player_version = 'unknown'
1789 player_desc = 'unknown'
1790 else:
1791 if player_url.endswith('swf'):
1792 player_version = self._search_regex(
1793 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1794 'flash player', fatal=False)
1795 player_desc = 'flash player %s' % player_version
1796 else:
1797 player_version = self._search_regex(
1798 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1799 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
1800 player_url,
1801 'html5 player', fatal=False)
1802 player_desc = 'html5 player %s' % player_version
1803
1804 parts_sizes = self._signature_cache_id(encrypted_sig)
1805 self.to_screen('{%s} signature length %s, %s' %
1806 (format_id, parts_sizes, player_desc))
1807
1808 signature = self._decrypt_signature(
1809 encrypted_sig, video_id, player_url, age_gate)
1810 url += '&signature=' + signature
1811 if 'ratebypass' not in url:
1812 url += '&ratebypass=yes'
1813
1814 dct = {
1815 'format_id': format_id,
1816 'url': url,
1817 'player_url': player_url,
1818 }
1819 if format_id in self._formats:
1820 dct.update(self._formats[format_id])
1821 if format_id in formats_spec:
1822 dct.update(formats_spec[format_id])
1823
1824 # Some itags are not included in DASH manifest thus corresponding formats will
1825 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1826 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1827 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1828 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1829
1830 filesize = int_or_none(url_data.get(
1831 'clen', [None])[0]) or _extract_filesize(url)
1832
1833 quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
1834
1835 more_fields = {
1836 'filesize': filesize,
1837 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1838 'width': width,
1839 'height': height,
1840 'fps': int_or_none(url_data.get('fps', [None])[0]),
1841 'format_note': quality,
1842 'quality': q(quality),
1843 }
1844 for key, value in more_fields.items():
1845 if value:
1846 dct[key] = value
1847 type_ = url_data.get('type', [None])[0]
1848 if type_:
1849 type_split = type_.split(';')
1850 kind_ext = type_split[0].split('/')
1851 if len(kind_ext) == 2:
1852 kind, _ = kind_ext
1853 dct['ext'] = mimetype2ext(type_split[0])
1854 if kind in ('audio', 'video'):
1855 codecs = None
1856 for mobj in re.finditer(
1857 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1858 if mobj.group('key') == 'codecs':
1859 codecs = mobj.group('val')
1860 break
1861 if codecs:
1862 dct.update(parse_codecs(codecs))
1863 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1864 dct['downloader_options'] = {
1865 # Youtube throttles chunks >~10M
1866 'http_chunk_size': 10485760,
1867 }
1868 formats.append(dct)
1869 elif video_info.get('hlsvp'):
1870 manifest_url = video_info['hlsvp'][0]
1871 formats = []
1872 m3u8_formats = self._extract_m3u8_formats(
1873 manifest_url, video_id, 'mp4', fatal=False)
1874 for a_format in m3u8_formats:
1875 itag = self._search_regex(
1876 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
1877 if itag:
1878 a_format['format_id'] = itag
1879 if itag in self._formats:
1880 dct = self._formats[itag].copy()
1881 dct.update(a_format)
1882 a_format = dct
1883 a_format['player_url'] = player_url
1884 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1885 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1886 formats.append(a_format)
1887 else:
1888 error_message = clean_html(video_info.get('reason', [None])[0])
1889 if not error_message:
1890 error_message = extract_unavailable_message()
1891 if error_message:
1892 raise ExtractorError(error_message, expected=True)
1893 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1894
1895 # uploader
1896 video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
1897 if video_uploader:
1898 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
1899 else:
1900 self._downloader.report_warning('unable to extract uploader name')
1901
1902 # uploader_id
1903 video_uploader_id = None
1904 video_uploader_url = None
1905 mobj = re.search(
1906 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1907 video_webpage)
1908 if mobj is not None:
1909 video_uploader_id = mobj.group('uploader_id')
1910 video_uploader_url = mobj.group('uploader_url')
1911 else:
1912 self._downloader.report_warning('unable to extract uploader nickname')
1913
1914 channel_id = self._html_search_meta(
1915 'channelId', video_webpage, 'channel id')
1916 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
1917
1918 # thumbnail image
1919 # We try first to get a high quality image:
1920 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1921 video_webpage, re.DOTALL)
1922 if m_thumb is not None:
1923 video_thumbnail = m_thumb.group(1)
1924 elif 'thumbnail_url' not in video_info:
1925 self._downloader.report_warning('unable to extract video thumbnail')
1926 video_thumbnail = None
1927 else: # don't panic if we can't find it
1928 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1929
1930 # upload date
1931 upload_date = self._html_search_meta(
1932 'datePublished', video_webpage, 'upload date', default=None)
1933 if not upload_date:
1934 upload_date = self._search_regex(
1935 [r'(?s)id="eow-date.*?>(.*?)</span>',
1936 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
1937 video_webpage, 'upload date', default=None)
1938 upload_date = unified_strdate(upload_date)
1939
1940 video_license = self._html_search_regex(
1941 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1942 video_webpage, 'license', default=None)
1943
1944 m_music = re.search(
1945 r'''(?x)
1946 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
1947 <ul[^>]*>\s*
1948 <li>(?P<title>.+?)
1949 by (?P<creator>.+?)
1950 (?:
1951 \(.+?\)|
1952 <a[^>]*
1953 (?:
1954 \bhref=["\']/red[^>]*>| # drop possible
1955 >\s*Listen ad-free with YouTube Red # YouTube Red ad
1956 )
1957 .*?
1958 )?</li
1959 ''',
1960 video_webpage)
1961 if m_music:
1962 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1963 video_creator = clean_html(m_music.group('creator'))
1964 else:
1965 video_alt_title = video_creator = None
1966
        def extract_meta(field):
            """Return the value of a titled metadata row (e.g. 'Song',
            'Artist') from the watch page's <h4 class="title">/<ul> markup,
            or None when the row is missing."""
            return self._html_search_regex(
                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
                video_webpage, field, default=None)
1971
1972 track = extract_meta('Song')
1973 artist = extract_meta('Artist')
1974
1975 m_episode = re.search(
1976 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
1977 video_webpage)
1978 if m_episode:
1979 series = m_episode.group('series')
1980 season_number = int(m_episode.group('season'))
1981 episode_number = int(m_episode.group('episode'))
1982 else:
1983 series = season_number = episode_number = None
1984
1985 m_cat_container = self._search_regex(
1986 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1987 video_webpage, 'categories', default=None)
1988 if m_cat_container:
1989 category = self._html_search_regex(
1990 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1991 default=None)
1992 video_categories = None if category is None else [category]
1993 else:
1994 video_categories = None
1995
1996 video_tags = [
1997 unescapeHTML(m.group('content'))
1998 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1999
        def _extract_count(count_name):
            """Scrape an integer counter (e.g. the 'like'/'dislike' button
            label) from the watch page; comma separators are handled by
            str_to_int. Returns None when the markup is not found."""
            return str_to_int(self._search_regex(
                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
                % re.escape(count_name),
                video_webpage, count_name, default=None))
2005
2006 like_count = _extract_count('like')
2007 dislike_count = _extract_count('dislike')
2008
2009 # subtitles
2010 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2011 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2012
2013 video_duration = try_get(
2014 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2015 if not video_duration:
2016 video_duration = parse_duration(self._html_search_meta(
2017 'duration', video_webpage, 'video duration'))
2018
2019 # annotations
2020 video_annotations = None
2021 if self._downloader.params.get('writeannotations', False):
2022 video_annotations = self._extract_annotations(video_id)
2023
2024 chapters = self._extract_chapters(description_original, video_duration)
2025
2026 # Look for the DASH manifest
2027 if self._downloader.params.get('youtube_include_dash_manifest', True):
2028 dash_mpd_fatal = True
2029 for mpd_url in dash_mpds:
2030 dash_formats = {}
2031 try:
                    def decrypt_sig(mobj):
                        # re.sub callback: replace an encrypted /s/<sig> path
                        # component of the DASH manifest URL with its decrypted
                        # /signature/<sig> form.
                        s = mobj.group(1)
                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                        return '/signature/%s' % dec_s
2036
2037 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2038
2039 for df in self._extract_mpd_formats(
2040 mpd_url, video_id, fatal=dash_mpd_fatal,
2041 formats_dict=self._formats):
2042 if not df.get('filesize'):
2043 df['filesize'] = _extract_filesize(df['url'])
2044 # Do not overwrite DASH format found in some previous DASH manifest
2045 if df['format_id'] not in dash_formats:
2046 dash_formats[df['format_id']] = df
2047 # Additional DASH manifests may end up in HTTP Error 403 therefore
2048 # allow them to fail without bug report message if we already have
2049 # some DASH manifest succeeded. This is temporary workaround to reduce
2050 # burst of bug reports until we figure out the reason and whether it
2051 # can be fixed at all.
2052 dash_mpd_fatal = False
2053 except (ExtractorError, KeyError) as e:
2054 self.report_warning(
2055 'Skipping DASH manifest: %r' % e, video_id)
2056 if dash_formats:
2057 # Remove the formats we found through non-DASH, they
2058 # contain less info and it can be wrong, because we use
2059 # fixed values (for example the resolution). See
2060 # https://github.com/rg3/youtube-dl/issues/5774 for an
2061 # example.
2062 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2063 formats.extend(dash_formats.values())
2064
2065 # Check for malformed aspect ratio
2066 stretched_m = re.search(
2067 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2068 video_webpage)
2069 if stretched_m:
2070 w = float(stretched_m.group('w'))
2071 h = float(stretched_m.group('h'))
2072 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2073 # We will only process correct ratios.
2074 if w > 0 and h > 0:
2075 ratio = w / h
2076 for f in formats:
2077 if f.get('vcodec') != 'none':
2078 f['stretched_ratio'] = ratio
2079
2080 self._sort_formats(formats)
2081
2082 self.mark_watched(video_id, video_info)
2083
2084 return {
2085 'id': video_id,
2086 'uploader': video_uploader,
2087 'uploader_id': video_uploader_id,
2088 'uploader_url': video_uploader_url,
2089 'channel_id': channel_id,
2090 'channel_url': channel_url,
2091 'upload_date': upload_date,
2092 'license': video_license,
2093 'creator': video_creator or artist,
2094 'title': video_title,
2095 'alt_title': video_alt_title or track,
2096 'thumbnail': video_thumbnail,
2097 'description': video_description,
2098 'categories': video_categories,
2099 'tags': video_tags,
2100 'subtitles': video_subtitles,
2101 'automatic_captions': automatic_captions,
2102 'duration': video_duration,
2103 'age_limit': 18 if age_gate else 0,
2104 'annotations': video_annotations,
2105 'chapters': chapters,
2106 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2107 'view_count': view_count,
2108 'like_count': like_count,
2109 'dislike_count': dislike_count,
2110 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
2111 'formats': formats,
2112 'is_live': is_live,
2113 'start_time': start_time,
2114 'end_time': end_time,
2115 'series': series,
2116 'season_number': season_number,
2117 'episode_number': episode_number,
2118 'track': track,
2119 'artist': artist,
2120 }
2121
2122
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extract YouTube playlists, including auto-generated mixes, album
    playlists (OLAK5uy_...) and playlist-style URLs that actually point at
    a single video.

    NOTE(review): extraction works by scraping the playlist HTML pages, so
    every regex below is tied to a particular generation of YouTube markup.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            youtube\.com/
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    # Canonical page used to fetch a playlist by id
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Per-entry regex used by the YoutubePlaylistBaseInfoExtractor machinery
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2017 華語最新單曲 (2/24更新)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'license': 'Standard YouTube License',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Authenticate with credentials from --username/--netrc, if provided
        # (implemented by the YoutubeBaseInfoExtractor machinery).
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix (auto-generated playlist).

        The mixes are generated from a single video: the id of the playlist
        is just 'RD' + video_id. Each watch page only exposes a window of
        the mix, so keep re-requesting from the last collected video until
        no new ids appear.
        """
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # The title element's class name differs between markup variants;
        # try each known one in turn.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download a regular playlist page and extract its entries.

        Returns (has_videos, playlist): has_videos is False when the URL
        did not actually serve a playlist with entries, so the caller can
        fall back to single-video extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                # Language chooser banner, not an error
                continue
            else:
                # Unknown alert: surface it but keep extracting
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        # Uploader name and /user/ or /channel/ link from the playlist header
        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Honour --no-playlist when the URL also identifies a video.

        Returns (video_id, result): result is a single-video url_result
        when --no-playlist is set, else None; video_id is None when the
        URL carries no video id at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix and regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2418
2419
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract all videos of a YouTube channel, preferring the channel's
    uploads playlist (UU...) when it can be resolved from the page, and
    falling back to page-by-page scraping otherwise."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they
        # also claim this URL.
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # `url` is unused in the base implementation; the parameter exists
        # so that subclasses can override with URL-dependent logic.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items,
        # i.e. 1050 videos total (see #5778). Work around it by resolving
        # the channel to its uploads playlist when possible, and only fall
        # back on channel-by-page extraction otherwise.
        probe_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if probe_page is False:
            uploads_channel_id = False
        else:
            uploads_channel_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not uploads_channel_id:
                # No channelId meta tag; try the mobile app deep links
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_url:
                    uploads_channel_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)
        if uploads_channel_id and uploads_channel_id.startswith('UC'):
            # A UCxxx channel id maps to the UUxxx uploads playlist
            return self.url_result(
                compat_urlparse.urljoin(
                    url, '/playlist?list=%s' % ('UU' + uploads_channel_id[2:])),
                'YoutubePlaylist')

        page = self._download_webpage(url, channel_id, 'Downloading page #1')
        is_autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', page) is not None

        if is_autogenerated:
            # Autogenerated channels list all of their videos on this one
            # page; the ajax continuation pages come back empty.
            return self.playlist_result(
                [self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                 for video_id, video_title in self.extract_videos_from_page(page)],
                channel_id)

        try:
            # Probe the first entry so that a channel page without any
            # videos can surface its yt-alert-message as an error.
            next(self._entries(page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(page, channel_id), channel_id)
2509
2510
class YoutubeUserIE(YoutubeChannelIE):
    """Extract all uploads of a user/channel given a /user/, /c/ or bare-name URL.

    Inherits the actual extraction from YoutubeChannelIE; this class only
    adjusts URL matching and the template URL used to fetch the videos tab.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: _VALID_URL is very permissive and would also match their
        # URLs, so defer to any sibling Youtube*IE that claims the URL first.
        other_yt_ies = (
            klass for (name, klass) in globals().items()
            if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Rebuild the videos-tab URL, defaulting the path kind to 'user'
        # when the URL carried no explicit user/c prefix (e.g. ytuser:).
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
2561
2562
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a /live URL either to the currently running live video or,
    failing that, back to the channel/user page it belongs to."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')
        # Page download is best-effort; on failure we still hand the base
        # channel URL to the generic resolution below.
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            looks_like_video = (
                page_type.startswith('video')
                and video_id
                and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
            if looks_like_video:
                return self.url_result(video_id, YoutubeIE.ie_key())
        # No live video found on the page: fall back to the channel itself.
        return self.url_result(base_url)
2613
2614
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract every playlist from a user's or channel's /playlists tab."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
2643
2644
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for search extractors: the video regex used on result pages."""
    # Matches each result's watch link; the title group is optional (the whole
    # trailing group may be absent), so callers must handle a missing title.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2647
2648
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML search results (requested with spf=navigate,
        which wraps the HTML in a JSON envelope) until n videos are collected
        or no "Next" link remains, and returns a playlist of at most n entries.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as we have enough results; the previous '>'
            # comparison caused one extra, useless page download whenever
            # exactly n results had been collected.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may overshoot the requested count; trim to n.
        return self.playlist_result(videos[:n], query)
2697
2698
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search extractor, but asks YouTube to sort by upload date."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Extra query parameter that switches the result ordering.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2704
2705
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract the videos listed on an explicit search-results URL."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The decoded query doubles as download id and playlist title.
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(
            self._process_page(webpage), playlist_title=query)
2726
2727
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract a (multi-season) show by delegating to its /playlists page."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        show_id = self._match_id(url)
        # Each season of a show is a playlist, so reuse the playlists extractor.
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % show_id)
2745
2746
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are personal, so the user must be logged in.
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        # Same pagination scheme as playlists, except the video-id regex
        # carries no index attribute.
        seen_ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # The 'recommended' feed pages endlessly, recycling (roughly) the
            # same videos in a slightly shuffled order, so only keep ids we
            # have not yielded yet and stop once a page adds nothing new.
            fresh_ids = [vid for vid in orderedSet(matches) if vid not in seen_ids]
            if not fresh_ids:
                break

            seen_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        feed_page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(feed_page), playlist_title=self._PLAYLIST_TITLE)
2798
2799
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extract the authenticated user's Watch Later list (playlist id WL)."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL with list=WL may refer to a single video; honor that first.
        _, video = self._check_download_just_video(url, 'WL')
        if video:
            return video
        return self._extract_playlist('WL')[1]
2819
2820
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds its playlist id in a list= parameter.
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
2831
2832
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
2838
2839
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
2845
2846
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
2852
2853
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose v= parameter was lost (typically eaten by the
    shell at an unquoted '&') and fail with a helpful quoting hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing to download — explain the shell-quoting problem instead.
        # (The message previously rendered with a double space before 'or'
        # and a stray space before the final period.)
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc.',
            expected=True)
2901
2902
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the required 11 chars."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        truncated_id = self._match_id(url)
        # A too-short id cannot be a valid video, so bail out loudly.
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)