]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[youtube] Extract channel meta fields (closes #9676, closes #12939)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 clean_html,
30 error_to_compat_str,
31 ExtractorError,
32 float_or_none,
33 get_element_by_attribute,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_codecs,
39 parse_duration,
40 qualities,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 uppercase_escape,
50 urlencode_postdata,
51 )
52
53
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist ids of all known prefixes (regular, liked, uploads, mixes, albums, ...)
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        # Force the English interface via the PREF cookie so that scraped
        # page content is in a predictable language
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap each bare video id in a url_result delegating to the Youtube IE
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            # Login is also considered satisfied when cookies are supplied
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # NOTE(review): returns None (falsy) here rather than False as
            # documented above; callers only test truthiness
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of Google's sign-in RPC. f_req is the positional
            # JSON payload; the response is JSON preceded by an anti-XSSI
            # prefix which the transform_source strips.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' (anti-XSSI junk)
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # NOTE(review): the positional layout of this payload and of all
        # try_get(..., lambda x: x[i][j]) extractions below mirrors Google's
        # undocumented sign-in protocol; verify against live responses
        # before changing any index.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token needed by the subsequent challenge requests
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # NOTE(review): returns None here, not False (see above)
            return

        # A non-empty entry at [0][5] signals a login error
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Present when Google demands an additional verification step
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" transaction token required by the TFA endpoint URL
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users may paste the code with its "G-" prefix; strip it
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-entry convention as the password challenge above
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved programmatically; map the
                # known identifiers to human-readable explanations
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Visiting the CheckCookie URL finalizes the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through myaccount.google.com
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Request the legacy (non-Polymer) page layout on every download,
        # since the extractors parse the old markup
        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Nothing to do without a downloader (e.g. during test collection)
        if self._downloader is None:
            return
        self._set_language()
        # Login failure is tolerated; extraction proceeds unauthenticated
        if not self._login():
            return
272
273
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Paginate through a page that exposes a "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following "Load more" continuations."""
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for result in self._process_page(current_html):
                yield result

            match = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if match is None:
                break

            payload = self._download_json(
                'https://youtube.com/%s' % match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = payload['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = payload['load_more_widget_html']
296
297
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Turn every (id, title) pair scraped from *content* into a url_result."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page(self, page):
        """Return (video_id, video_title) pairs found on *page*.

        Duplicate ids are collapsed; a later occurrence may supply a title
        that an earlier, title-less occurrence lacked.
        """
        seen_ids = []
        seen_titles = []
        for match in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in match.groupdict() and match.group('id') == '0':
                continue
            vid = match.group('id')
            title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            if vid in seen_ids:
                # Already recorded: backfill a missing title if we now have one
                pos = seen_ids.index(vid)
                if title and not seen_titles[pos]:
                    seen_titles[pos] = title
            else:
                seen_ids.append(vid)
                seen_titles.append(title)
        return zip(seen_ids, seen_titles)
322
323
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a url_result for every playlist linked from *content*."""
        pattern = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        found_ids = re.findall(pattern, content)
        # orderedSet drops duplicates while keeping first-seen order
        for list_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % list_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Collect all playlists on the page into a single playlist result."""
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)
        page_title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, list_id), list_id, page_title)
337
338
339 class YoutubeIE(YoutubeBaseInfoExtractor):
340 IE_DESC = 'YouTube.com'
341 _VALID_URL = r"""(?x)^
342 (
343 (?:https?://|//) # http(s):// or protocol-independent URL
344 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
345 (?:www\.)?deturl\.com/www\.youtube\.com/|
346 (?:www\.)?pwnyoutube\.com/|
347 (?:www\.)?hooktube\.com/|
348 (?:www\.)?yourepeat\.com/|
349 tube\.majestyc\.net/|
350 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
351 (?:.*?\#/)? # handle anchor (#/) redirect urls
352 (?: # the various things that can precede the ID:
353 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
354 |(?: # or the v= param in all its forms
355 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
356 (?:\?|\#!?) # the params delimiter ? or # or #!
357 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
358 v=
359 )
360 ))
361 |(?:
362 youtu\.be| # just youtu.be/xxxx
363 vid\.plus| # or vid.plus/xxxx
364 zwearz\.com/watch| # or zwearz.com/watch/xxxx
365 )/
366 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
367 )
368 )? # all until now is optional -> you can pass the naked ID
369 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
370 (?!.*?\blist=
371 (?:
372 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
373 WL # WL are handled by the watch later IE
374 )
375 )
376 (?(1).+)? # if we found the ID, everything can follow
377 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
378 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
379 _formats = {
380 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
381 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
382 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
383 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
384 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
385 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
386 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
387 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
388 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
389 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
390 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
391 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
392 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
393 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
394 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
395 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
396 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
398
399
400 # 3D videos
401 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
402 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
403 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
404 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
405 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
406 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
407 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
408
409 # Apple HTTP Live Streaming
410 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
411 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
412 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
413 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
414 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
415 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
416 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
417 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
418
419 # DASH mp4 video
420 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
421 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
422 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
423 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
424 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
425 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
426 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
427 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
428 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
429 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
430 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
431 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
432
433 # Dash mp4 audio
434 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
435 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
436 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
437 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
438 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
439 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
440 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
441
442 # Dash webm
443 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
444 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
445 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
446 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
447 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
448 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
449 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
450 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
451 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
452 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
453 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
454 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
455 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
456 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
457 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
458 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
459 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
460 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
461 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
462 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
463 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
465
466 # Dash webm audio
467 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
468 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
469
470 # Dash webm audio with opus inside
471 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
472 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
473 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
474
475 # RTMP (unnamed)
476 '_rtmp': {'protocol': 'rtmp'},
477 }
478 _SUBTITLE_FORMATS = ('ttml', 'vtt')
479
480 _GEO_BYPASS = False
481
482 IE_NAME = 'youtube'
483 _TESTS = [
484 {
485 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
486 'info_dict': {
487 'id': 'BaW_jenozKc',
488 'ext': 'mp4',
489 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
490 'uploader': 'Philipp Hagemeister',
491 'uploader_id': 'phihag',
492 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
493 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
494 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
495 'upload_date': '20121002',
496 'license': 'Standard YouTube License',
497 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
498 'categories': ['Science & Technology'],
499 'tags': ['youtube-dl'],
500 'duration': 10,
501 'like_count': int,
502 'dislike_count': int,
503 'start_time': 1,
504 'end_time': 9,
505 }
506 },
507 {
508 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
509 'note': 'Test generic use_cipher_signature video (#897)',
510 'info_dict': {
511 'id': 'UxxajLWwzqY',
512 'ext': 'mp4',
513 'upload_date': '20120506',
514 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
515 'alt_title': 'I Love It (feat. Charli XCX)',
516 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
517 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
518 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
519 'iconic ep', 'iconic', 'love', 'it'],
520 'duration': 180,
521 'uploader': 'Icona Pop',
522 'uploader_id': 'IconaPop',
523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
524 'license': 'Standard YouTube License',
525 'creator': 'Icona Pop',
526 'track': 'I Love It (feat. Charli XCX)',
527 'artist': 'Icona Pop',
528 }
529 },
530 {
531 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
532 'note': 'Test VEVO video with age protection (#956)',
533 'info_dict': {
534 'id': '07FYdnEawAQ',
535 'ext': 'mp4',
536 'upload_date': '20130703',
537 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
538 'alt_title': 'Tunnel Vision',
539 'description': 'md5:64249768eec3bc4276236606ea996373',
540 'duration': 419,
541 'uploader': 'justintimberlakeVEVO',
542 'uploader_id': 'justintimberlakeVEVO',
543 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
544 'license': 'Standard YouTube License',
545 'creator': 'Justin Timberlake',
546 'track': 'Tunnel Vision',
547 'artist': 'Justin Timberlake',
548 'age_limit': 18,
549 }
550 },
551 {
552 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
553 'note': 'Embed-only video (#1746)',
554 'info_dict': {
555 'id': 'yZIXLfi8CZQ',
556 'ext': 'mp4',
557 'upload_date': '20120608',
558 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
559 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
560 'uploader': 'SET India',
561 'uploader_id': 'setindia',
562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
563 'license': 'Standard YouTube License',
564 'age_limit': 18,
565 }
566 },
567 {
568 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
569 'note': 'Use the first video ID in the URL',
570 'info_dict': {
571 'id': 'BaW_jenozKc',
572 'ext': 'mp4',
573 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
574 'uploader': 'Philipp Hagemeister',
575 'uploader_id': 'phihag',
576 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
577 'upload_date': '20121002',
578 'license': 'Standard YouTube License',
579 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
580 'categories': ['Science & Technology'],
581 'tags': ['youtube-dl'],
582 'duration': 10,
583 'like_count': int,
584 'dislike_count': int,
585 },
586 'params': {
587 'skip_download': True,
588 },
589 },
590 {
591 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
592 'note': '256k DASH audio (format 141) via DASH manifest',
593 'info_dict': {
594 'id': 'a9LDPn-MO4I',
595 'ext': 'm4a',
596 'upload_date': '20121002',
597 'uploader_id': '8KVIDEO',
598 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
599 'description': '',
600 'uploader': '8KVIDEO',
601 'license': 'Standard YouTube License',
602 'title': 'UHDTV TEST 8K VIDEO.mp4'
603 },
604 'params': {
605 'youtube_include_dash_manifest': True,
606 'format': '141',
607 },
608 'skip': 'format 141 not served anymore',
609 },
610 # DASH manifest with encrypted signature
611 {
612 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
613 'info_dict': {
614 'id': 'IB3lcPjvWLA',
615 'ext': 'm4a',
616 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
617 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
618 'duration': 244,
619 'uploader': 'AfrojackVEVO',
620 'uploader_id': 'AfrojackVEVO',
621 'upload_date': '20131011',
622 'license': 'Standard YouTube License',
623 },
624 'params': {
625 'youtube_include_dash_manifest': True,
626 'format': '141/bestaudio[ext=m4a]',
627 },
628 },
629 # JS player signature function name containing $
630 {
631 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
632 'info_dict': {
633 'id': 'nfWlot6h_JM',
634 'ext': 'm4a',
635 'title': 'Taylor Swift - Shake It Off',
636 'alt_title': 'Shake It Off',
637 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
638 'duration': 242,
639 'uploader': 'TaylorSwiftVEVO',
640 'uploader_id': 'TaylorSwiftVEVO',
641 'upload_date': '20140818',
642 'license': 'Standard YouTube License',
643 'creator': 'Taylor Swift',
644 },
645 'params': {
646 'youtube_include_dash_manifest': True,
647 'format': '141/bestaudio[ext=m4a]',
648 },
649 },
650 # Controversy video
651 {
652 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
653 'info_dict': {
654 'id': 'T4XJQO3qol8',
655 'ext': 'mp4',
656 'duration': 219,
657 'upload_date': '20100909',
658 'uploader': 'TJ Kirk',
659 'uploader_id': 'TheAmazingAtheist',
660 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
661 'license': 'Standard YouTube License',
662 'title': 'Burning Everyone\'s Koran',
663 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
664 }
665 },
666 # Normal age-gate video (No vevo, embed allowed)
667 {
668 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
669 'info_dict': {
670 'id': 'HtVdAasjOgU',
671 'ext': 'mp4',
672 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
673 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
674 'duration': 142,
675 'uploader': 'The Witcher',
676 'uploader_id': 'WitcherGame',
677 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
678 'upload_date': '20140605',
679 'license': 'Standard YouTube License',
680 'age_limit': 18,
681 },
682 },
683 # Age-gate video with encrypted signature
684 {
685 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
686 'info_dict': {
687 'id': '6kLq3WMV1nU',
688 'ext': 'webm',
689 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
690 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
691 'duration': 246,
692 'uploader': 'LloydVEVO',
693 'uploader_id': 'LloydVEVO',
694 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
695 'upload_date': '20110629',
696 'license': 'Standard YouTube License',
697 'age_limit': 18,
698 },
699 },
700 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
701 # YouTube Red ad is not captured for creator
702 {
703 'url': '__2ABJjxzNo',
704 'info_dict': {
705 'id': '__2ABJjxzNo',
706 'ext': 'mp4',
707 'duration': 266,
708 'upload_date': '20100430',
709 'uploader_id': 'deadmau5',
710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
711 'creator': 'deadmau5',
712 'description': 'md5:12c56784b8032162bb936a5f76d55360',
713 'uploader': 'deadmau5',
714 'license': 'Standard YouTube License',
715 'title': 'Deadmau5 - Some Chords (HD)',
716 'alt_title': 'Some Chords',
717 },
718 'expected_warnings': [
719 'DASH manifest missing',
720 ]
721 },
722 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
723 {
724 'url': 'lqQg6PlCWgI',
725 'info_dict': {
726 'id': 'lqQg6PlCWgI',
727 'ext': 'mp4',
728 'duration': 6085,
729 'upload_date': '20150827',
730 'uploader_id': 'olympic',
731 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
732 'license': 'Standard YouTube License',
733 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
734 'uploader': 'Olympic',
735 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
736 },
737 'params': {
738 'skip_download': 'requires avconv',
739 }
740 },
741 # Non-square pixels
742 {
743 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
744 'info_dict': {
745 'id': '_b-2C3KPAM0',
746 'ext': 'mp4',
747 'stretched_ratio': 16 / 9.,
748 'duration': 85,
749 'upload_date': '20110310',
750 'uploader_id': 'AllenMeow',
751 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
752 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
753 'uploader': '孫ᄋᄅ',
754 'license': 'Standard YouTube License',
755 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
756 },
757 },
758 # url_encoded_fmt_stream_map is empty string
759 {
760 'url': 'qEJwOuvDf7I',
761 'info_dict': {
762 'id': 'qEJwOuvDf7I',
763 'ext': 'webm',
764 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
765 'description': '',
766 'upload_date': '20150404',
767 'uploader_id': 'spbelect',
768 'uploader': 'Наблюдатели Петербурга',
769 },
770 'params': {
771 'skip_download': 'requires avconv',
772 },
773 'skip': 'This live event has ended.',
774 },
775 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
776 {
777 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
778 'info_dict': {
779 'id': 'FIl7x6_3R5Y',
780 'ext': 'webm',
781 'title': 'md5:7b81415841e02ecd4313668cde88737a',
782 'description': 'md5:116377fd2963b81ec4ce64b542173306',
783 'duration': 220,
784 'upload_date': '20150625',
785 'uploader_id': 'dorappi2000',
786 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
787 'uploader': 'dorappi2000',
788 'license': 'Standard YouTube License',
789 'formats': 'mincount:31',
790 },
791 'skip': 'not actual anymore',
792 },
793 # DASH manifest with segment_list
794 {
795 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
796 'md5': '8ce563a1d667b599d21064e982ab9e31',
797 'info_dict': {
798 'id': 'CsmdDsKjzN8',
799 'ext': 'mp4',
800 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
801 'uploader': 'Airtek',
802 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
803 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
804 'license': 'Standard YouTube License',
805 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
806 },
807 'params': {
808 'youtube_include_dash_manifest': True,
809 'format': '135', # bestvideo
810 },
811 'skip': 'This live event has ended.',
812 },
813 {
814 # Multifeed videos (multiple cameras), URL is for Main Camera
815 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
816 'info_dict': {
817 'id': 'jqWvoWXjCVs',
818 'title': 'teamPGP: Rocket League Noob Stream',
819 'description': 'md5:dc7872fb300e143831327f1bae3af010',
820 },
821 'playlist': [{
822 'info_dict': {
823 'id': 'jqWvoWXjCVs',
824 'ext': 'mp4',
825 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
826 'description': 'md5:dc7872fb300e143831327f1bae3af010',
827 'duration': 7335,
828 'upload_date': '20150721',
829 'uploader': 'Beer Games Beer',
830 'uploader_id': 'beergamesbeer',
831 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
832 'license': 'Standard YouTube License',
833 },
834 }, {
835 'info_dict': {
836 'id': '6h8e8xoXJzg',
837 'ext': 'mp4',
838 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
839 'description': 'md5:dc7872fb300e143831327f1bae3af010',
840 'duration': 7337,
841 'upload_date': '20150721',
842 'uploader': 'Beer Games Beer',
843 'uploader_id': 'beergamesbeer',
844 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
845 'license': 'Standard YouTube License',
846 },
847 }, {
848 'info_dict': {
849 'id': 'PUOgX5z9xZw',
850 'ext': 'mp4',
851 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
852 'description': 'md5:dc7872fb300e143831327f1bae3af010',
853 'duration': 7337,
854 'upload_date': '20150721',
855 'uploader': 'Beer Games Beer',
856 'uploader_id': 'beergamesbeer',
857 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
858 'license': 'Standard YouTube License',
859 },
860 }, {
861 'info_dict': {
862 'id': 'teuwxikvS5k',
863 'ext': 'mp4',
864 'title': 'teamPGP: Rocket League Noob Stream (zim)',
865 'description': 'md5:dc7872fb300e143831327f1bae3af010',
866 'duration': 7334,
867 'upload_date': '20150721',
868 'uploader': 'Beer Games Beer',
869 'uploader_id': 'beergamesbeer',
870 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
871 'license': 'Standard YouTube License',
872 },
873 }],
874 'params': {
875 'skip_download': True,
876 },
877 },
878 {
879 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
880 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
881 'info_dict': {
882 'id': 'gVfLd0zydlo',
883 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
884 },
885 'playlist_count': 2,
886 'skip': 'Not multifeed anymore',
887 },
888 {
889 'url': 'https://vid.plus/FlRa-iH7PGw',
890 'only_matching': True,
891 },
892 {
893 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
894 'only_matching': True,
895 },
896 {
897 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
898 # Also tests cut-off URL expansion in video description (see
899 # https://github.com/rg3/youtube-dl/issues/1892,
900 # https://github.com/rg3/youtube-dl/issues/8164)
901 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
902 'info_dict': {
903 'id': 'lsguqyKfVQg',
904 'ext': 'mp4',
905 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
906 'alt_title': 'Dark Walk - Position Music',
907 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
908 'duration': 133,
909 'upload_date': '20151119',
910 'uploader_id': 'IronSoulElf',
911 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
912 'uploader': 'IronSoulElf',
913 'license': 'Standard YouTube License',
914 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
915 'track': 'Dark Walk - Position Music',
916 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
922 {
923 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
924 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
925 'only_matching': True,
926 },
927 {
928 # Video with yt:stretch=17:0
929 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
930 'info_dict': {
931 'id': 'Q39EVAstoRM',
932 'ext': 'mp4',
933 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
934 'description': 'md5:ee18a25c350637c8faff806845bddee9',
935 'upload_date': '20151107',
936 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
937 'uploader': 'CH GAMER DROID',
938 },
939 'params': {
940 'skip_download': True,
941 },
942 'skip': 'This video does not exist.',
943 },
944 {
945 # Video licensed under Creative Commons
946 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
947 'info_dict': {
948 'id': 'M4gD1WSo5mA',
949 'ext': 'mp4',
950 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
951 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
952 'duration': 721,
953 'upload_date': '20150127',
954 'uploader_id': 'BerkmanCenter',
955 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
956 'uploader': 'The Berkman Klein Center for Internet & Society',
957 'license': 'Creative Commons Attribution license (reuse allowed)',
958 },
959 'params': {
960 'skip_download': True,
961 },
962 },
963 {
964 # Channel-like uploader_url
965 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
966 'info_dict': {
967 'id': 'eQcmzGIKrzg',
968 'ext': 'mp4',
969 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
970 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
971 'duration': 4060,
972 'upload_date': '20151119',
973 'uploader': 'Bernie Sanders',
974 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
975 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
976 'license': 'Creative Commons Attribution license (reuse allowed)',
977 },
978 'params': {
979 'skip_download': True,
980 },
981 },
982 {
983 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
984 'only_matching': True,
985 },
986 {
987 # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
988 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
989 'only_matching': True,
990 },
991 {
992 # Rental video preview
993 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
994 'info_dict': {
995 'id': 'uGpuVWrhIzE',
996 'ext': 'mp4',
997 'title': 'Piku - Trailer',
998 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
999 'upload_date': '20150811',
1000 'uploader': 'FlixMatrix',
1001 'uploader_id': 'FlixMatrixKaravan',
1002 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1003 'license': 'Standard YouTube License',
1004 },
1005 'params': {
1006 'skip_download': True,
1007 },
1008 'skip': 'This video is not available.',
1009 },
1010 {
1011 # YouTube Red video with episode data
1012 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1013 'info_dict': {
1014 'id': 'iqKdEhx-dD4',
1015 'ext': 'mp4',
1016 'title': 'Isolation - Mind Field (Ep 1)',
1017 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
1018 'duration': 2085,
1019 'upload_date': '20170118',
1020 'uploader': 'Vsauce',
1021 'uploader_id': 'Vsauce',
1022 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1023 'license': 'Standard YouTube License',
1024 'series': 'Mind Field',
1025 'season_number': 1,
1026 'episode_number': 1,
1027 },
1028 'params': {
1029 'skip_download': True,
1030 },
1031 'expected_warnings': [
1032 'Skipping DASH manifest',
1033 ],
1034 },
1035 {
1036 # The following content has been identified by the YouTube community
1037 # as inappropriate or offensive to some audiences.
1038 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1039 'info_dict': {
1040 'id': '6SJNVb0GnPI',
1041 'ext': 'mp4',
1042 'title': 'Race Differences in Intelligence',
1043 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1044 'duration': 965,
1045 'upload_date': '20140124',
1046 'uploader': 'New Century Foundation',
1047 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1048 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1049 'license': 'Standard YouTube License',
1050 },
1051 'params': {
1052 'skip_download': True,
1053 },
1054 },
1055 {
1056 # itag 212
1057 'url': '1t24XAntNCY',
1058 'only_matching': True,
1059 },
1060 {
1061 # geo restricted to JP
1062 'url': 'sJL6WA-aGkQ',
1063 'only_matching': True,
1064 },
1065 {
1066 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1067 'only_matching': True,
1068 },
1069 ]
1070
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and its per-run signature-function cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> deciphering function so the
        # signature algorithm is only extracted once per player per run.
        self._player_cache = {}
1074
1075 def report_video_info_webpage_download(self, video_id):
1076 """Report attempt to download video info webpage."""
1077 self.to_screen('%s: Downloading video info webpage' % video_id)
1078
1079 def report_information_extraction(self, video_id):
1080 """Report attempt to extract video information."""
1081 self.to_screen('%s: Extracting video information' % video_id)
1082
1083 def report_unavailable_format(self, video_id, format):
1084 """Report extracted video URL."""
1085 self.to_screen('%s: Format %s not available' % (video_id, format))
1086
1087 def report_rtmp_download(self):
1088 """Indicate the download will use the RTMP protocol."""
1089 self.to_screen('RTMP download detected')
1090
1091 def _signature_cache_id(self, example_sig):
1092 """ Return a string representation of a signature """
1093 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1094
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a function that deciphers an encrypted signature string.

        The result is cached on disk, keyed by player type/id plus the
        signature "cache id" (lengths of the dot-separated parts), since the
        algorithm depends only on those.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')  # 'js' (HTML5 player) or 'swf' (Flash)
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache file name, so it must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: output char k is s[cache_spec[k]]
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of distinct characters
        # to learn which input position each output character came from, and
        # cache that permutation. NOTE(review): this assumes the decipher
        # function only rearranges/drops characters — confirm for new players.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1140
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*.

        Used with --youtube-print-sig-code so the algorithm can be inspected
        or hardcoded.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with a constant step as a slice literal.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it or flush it as a slice.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index: emit a single subscript.
                    yield 's[%d]' % prev
            # Flush the final element or the trailing run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct characters to recover the output permutation.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1179
1180 def _parse_sig_js(self, jscode):
1181 funcname = self._search_regex(
1182 (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1183 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1184 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1185 r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1186 jscode, 'Initial JS player signature function name', group='sig')
1187
1188 jsi = JSInterpreter(jscode)
1189 initial_function = jsi.extract_function(funcname)
1190 return lambda s: initial_function([s])
1191
1192 def _parse_sig_swf(self, file_contents):
1193 swfi = SWFInterpreter(file_contents)
1194 TARGET_CLASSNAME = 'SignatureDecipher'
1195 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1196 initial_function = swfi.extract_function(searched_class, 'decipher')
1197 return lambda s: initial_function([s])
1198
1199 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1200 """Turn the encrypted s field into a working signature"""
1201
1202 if player_url is None:
1203 raise ExtractorError('Cannot decrypt signature without player_url')
1204
1205 if player_url.startswith('//'):
1206 player_url = 'https:' + player_url
1207 elif not re.match(r'https?://', player_url):
1208 player_url = compat_urlparse.urljoin(
1209 'https://www.youtube.com', player_url)
1210 try:
1211 player_id = (player_url, self._signature_cache_id(s))
1212 if player_id not in self._player_cache:
1213 func = self._extract_signature_function(
1214 video_id, player_url, s
1215 )
1216 self._player_cache[player_id] = func
1217 func = self._player_cache[player_id]
1218 if self._downloader.params.get('youtube_print_sig_code'):
1219 self._print_sig_code(func, s)
1220 return func(s)
1221 except Exception as e:
1222 tb = traceback.format_exc()
1223 raise ExtractorError(
1224 'Signature extraction failed: ' + tb, cause=e)
1225
1226 def _get_subtitles(self, video_id, webpage):
1227 try:
1228 subs_doc = self._download_xml(
1229 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1230 video_id, note=False)
1231 except ExtractorError as err:
1232 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1233 return {}
1234
1235 sub_lang_list = {}
1236 for track in subs_doc.findall('track'):
1237 lang = track.attrib['lang_code']
1238 if lang in sub_lang_list:
1239 continue
1240 sub_formats = []
1241 for ext in self._SUBTITLE_FORMATS:
1242 params = compat_urllib_parse_urlencode({
1243 'lang': lang,
1244 'v': video_id,
1245 'fmt': ext,
1246 'name': track.attrib['name'].encode('utf-8'),
1247 })
1248 sub_formats.append({
1249 'url': 'https://www.youtube.com/api/timedtext?' + params,
1250 'ext': ext,
1251 })
1252 sub_lang_list[lang] = sub_formats
1253 if not sub_lang_list:
1254 self._downloader.report_warning('video doesn\'t have subtitles')
1255 return {}
1256 return sub_lang_list
1257
1258 def _get_ytplayer_config(self, video_id, webpage):
1259 patterns = (
1260 # User data may contain arbitrary character sequences that may affect
1261 # JSON extraction with regex, e.g. when '};' is contained the second
1262 # regex won't capture the whole JSON. Yet working around by trying more
1263 # concrete regex first keeping in mind proper quoted string handling
1264 # to be implemented in future that will replace this workaround (see
1265 # https://github.com/rg3/youtube-dl/issues/7468,
1266 # https://github.com/rg3/youtube-dl/pull/7599)
1267 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1268 r';ytplayer\.config\s*=\s*({.+?});',
1269 )
1270 config = self._search_regex(
1271 patterns, webpage, 'ytplayer.config', default=None)
1272 if config:
1273 return self._parse_json(
1274 uppercase_escape(config), video_id, fatal=False)
1275
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy flow: 'ttsurl' points directly at the timedtext API.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One entry per translation target language.
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build one entry per language by rewriting the 'tlang' and
                # 'fmt' query parameters of the base caption URL.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1377
1378 def _mark_watched(self, video_id, video_info):
1379 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1380 if not playback_url:
1381 return
1382 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1383 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1384
1385 # cpn generation algorithm is reverse engineered from base.js.
1386 # In fact it works even with dummy cpn.
1387 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1388 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1389
1390 qs.update({
1391 'ver': ['2'],
1392 'cpn': [cpn],
1393 })
1394 playback_url = compat_urlparse.urlunparse(
1395 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1396
1397 self._download_webpage(
1398 playback_url, video_id, 'Marking watched',
1399 'Unable to mark watched', fatal=False)
1400
1401 @staticmethod
1402 def _extract_urls(webpage):
1403 # Embedded YouTube player
1404 entries = [
1405 unescapeHTML(mobj.group('url'))
1406 for mobj in re.finditer(r'''(?x)
1407 (?:
1408 <iframe[^>]+?src=|
1409 data-video-url=|
1410 <embed[^>]+?src=|
1411 embedSWF\(?:\s*|
1412 <object[^>]+data=|
1413 new\s+SWFObject\(
1414 )
1415 (["\'])
1416 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1417 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1418 \1''', webpage)]
1419
1420 # lazyYT YouTube embed
1421 entries.extend(list(map(
1422 unescapeHTML,
1423 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1424
1425 # Wordpress "YouTube Video Importer" plugin
1426 matches = re.findall(r'''(?x)<div[^>]+
1427 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1428 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1429 entries.extend(m[-1] for m in matches)
1430
1431 return entries
1432
1433 @staticmethod
1434 def _extract_url(webpage):
1435 urls = YoutubeIE._extract_urls(webpage)
1436 return urls[0] if urls else None
1437
1438 @classmethod
1439 def extract_id(cls, url):
1440 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1441 if mobj is None:
1442 raise ExtractorError('Invalid URL: %s' % url)
1443 video_id = mobj.group(2)
1444 return video_id
1445
1446 def _extract_annotations(self, video_id):
1447 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1448 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1449
1450 @staticmethod
1451 def _extract_chapters(description, duration):
1452 if not description:
1453 return None
1454 chapter_lines = re.findall(
1455 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1456 description)
1457 if not chapter_lines:
1458 return None
1459 chapters = []
1460 for next_num, (chapter_line, time_point) in enumerate(
1461 chapter_lines, start=1):
1462 start_time = parse_duration(time_point)
1463 if start_time is None:
1464 continue
1465 if start_time > duration:
1466 break
1467 end_time = (duration if next_num == len(chapter_lines)
1468 else parse_duration(chapter_lines[next_num][1]))
1469 if end_time is None:
1470 continue
1471 if end_time > duration:
1472 end_time = duration
1473 if start_time > end_time:
1474 break
1475 chapter_title = re.sub(
1476 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1477 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1478 chapters.append({
1479 'start_time': start_time,
1480 'end_time': end_time,
1481 'title': chapter_title,
1482 })
1483 return chapters
1484
1485 def _real_extract(self, url):
1486 url, smuggled_data = unsmuggle_url(url, {})
1487
1488 proto = (
1489 'http' if self._downloader.params.get('prefer_insecure', False)
1490 else 'https')
1491
1492 start_time = None
1493 end_time = None
1494 parsed_url = compat_urllib_parse_urlparse(url)
1495 for component in [parsed_url.fragment, parsed_url.query]:
1496 query = compat_parse_qs(component)
1497 if start_time is None and 't' in query:
1498 start_time = parse_duration(query['t'][0])
1499 if start_time is None and 'start' in query:
1500 start_time = parse_duration(query['start'][0])
1501 if end_time is None and 'end' in query:
1502 end_time = parse_duration(query['end'][0])
1503
1504 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1505 mobj = re.search(self._NEXT_URL_RE, url)
1506 if mobj:
1507 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1508 video_id = self.extract_id(url)
1509
1510 # Get video webpage
1511 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1512 video_webpage = self._download_webpage(url, video_id)
1513
1514 # Attempt to extract SWF player URL
1515 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1516 if mobj is not None:
1517 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1518 else:
1519 player_url = None
1520
1521 dash_mpds = []
1522
1523 def add_dash_mpd(video_info):
1524 dash_mpd = video_info.get('dashmpd')
1525 if dash_mpd and dash_mpd[0] not in dash_mpds:
1526 dash_mpds.append(dash_mpd[0])
1527
1528 is_live = None
1529 view_count = None
1530
1531 def extract_view_count(v_info):
1532 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1533
1534 # Get video info
1535 embed_webpage = None
1536 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1537 age_gate = True
1538 # We simulate the access to the video from www.youtube.com/v/{video_id}
1539 # this can be viewed without login into Youtube
1540 url = proto + '://www.youtube.com/embed/%s' % video_id
1541 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1542 data = compat_urllib_parse_urlencode({
1543 'video_id': video_id,
1544 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1545 'sts': self._search_regex(
1546 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1547 })
1548 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1549 video_info_webpage = self._download_webpage(
1550 video_info_url, video_id,
1551 note='Refetching age-gated info webpage',
1552 errnote='unable to download video info webpage')
1553 video_info = compat_parse_qs(video_info_webpage)
1554 add_dash_mpd(video_info)
1555 else:
1556 age_gate = False
1557 video_info = None
1558 sts = None
1559 # Try looking directly into the video webpage
1560 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1561 if ytplayer_config:
1562 args = ytplayer_config['args']
1563 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1564 # Convert to the same format returned by compat_parse_qs
1565 video_info = dict((k, [v]) for k, v in args.items())
1566 add_dash_mpd(video_info)
1567 # Rental video is not rented but preview is available (e.g.
1568 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1569 # https://github.com/rg3/youtube-dl/issues/10532)
1570 if not video_info and args.get('ypc_vid'):
1571 return self.url_result(
1572 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1573 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1574 is_live = True
1575 sts = ytplayer_config.get('sts')
1576 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1577 # We also try looking in get_video_info since it may contain different dashmpd
1578 # URL that points to a DASH manifest with possibly different itag set (some itags
1579 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1580 # manifest pointed by get_video_info's dashmpd).
1581 # The general idea is to take a union of itags of both DASH manifests (for example
1582 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1583 self.report_video_info_webpage_download(video_id)
1584 for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
1585 query = {
1586 'video_id': video_id,
1587 'ps': 'default',
1588 'eurl': '',
1589 'gl': 'US',
1590 'hl': 'en',
1591 }
1592 if el:
1593 query['el'] = el
1594 if sts:
1595 query['sts'] = sts
1596 video_info_webpage = self._download_webpage(
1597 '%s://www.youtube.com/get_video_info' % proto,
1598 video_id, note=False,
1599 errnote='unable to download video info webpage',
1600 fatal=False, query=query)
1601 if not video_info_webpage:
1602 continue
1603 get_video_info = compat_parse_qs(video_info_webpage)
1604 add_dash_mpd(get_video_info)
1605 if view_count is None:
1606 view_count = extract_view_count(get_video_info)
1607 if not video_info:
1608 video_info = get_video_info
1609 if 'token' in get_video_info:
1610 # Different get_video_info requests may report different results, e.g.
1611 # some may report video unavailability, but some may serve it without
1612 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1613 # the original webpage as well as el=info and el=embedded get_video_info
1614 # requests report video unavailability due to geo restriction while
1615 # el=detailpage succeeds and returns valid data). This is probably
1616 # due to YouTube measures against IP ranges of hosting providers.
1617 # Working around by preferring the first succeeded video_info containing
1618 # the token if no such video_info yet was found.
1619 if 'token' not in video_info:
1620 video_info = get_video_info
1621 break
1622
1623 def extract_unavailable_message():
1624 return self._html_search_regex(
1625 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1626 video_webpage, 'unavailable message', default=None)
1627
1628 if 'token' not in video_info:
1629 if 'reason' in video_info:
1630 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1631 regions_allowed = self._html_search_meta(
1632 'regionsAllowed', video_webpage, default=None)
1633 countries = regions_allowed.split(',') if regions_allowed else None
1634 self.raise_geo_restricted(
1635 msg=video_info['reason'][0], countries=countries)
1636 reason = video_info['reason'][0]
1637 if 'Invalid parameters' in reason:
1638 unavailable_message = extract_unavailable_message()
1639 if unavailable_message:
1640 reason = unavailable_message
1641 raise ExtractorError(
1642 'YouTube said: %s' % reason,
1643 expected=True, video_id=video_id)
1644 else:
1645 raise ExtractorError(
1646 '"token" parameter not in video info for unknown reason',
1647 video_id=video_id)
1648
1649 # title
1650 if 'title' in video_info:
1651 video_title = video_info['title'][0]
1652 else:
1653 self._downloader.report_warning('Unable to extract video title')
1654 video_title = '_'
1655
1656 # description
1657 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1658 if video_description:
1659
            def replace_url(m):
                # re.sub callback: m.group(1) is the title/href attribute of a
                # truncated description link.  Resolve it against the watch
                # page URL first (it may be relative, e.g. "/redirect?...").
                redir_url = compat_urlparse.urljoin(url, m.group(1))
                parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
                # YouTube wraps external links in a /redirect?q=<target>
                # indirection; unwrap it so the cleaned description contains
                # the real destination URL instead of the redirector.
                if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                    qs = compat_parse_qs(parsed_redir_url.query)
                    q = qs.get('q')
                    if q and q[0]:
                        return q[0]
                return redir_url
1669
1670 description_original = video_description = re.sub(r'''(?x)
1671 <a\s+
1672 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1673 (?:title|href)="([^"]+)"\s+
1674 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1675 class="[^"]*"[^>]*>
1676 [^<]+\.{3}\s*
1677 </a>
1678 ''', replace_url, video_description)
1679 video_description = clean_html(video_description)
1680 else:
1681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1682 if fd_mobj:
1683 video_description = unescapeHTML(fd_mobj.group(1))
1684 else:
1685 video_description = ''
1686
1687 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1688 if not self._downloader.params.get('noplaylist'):
1689 entries = []
1690 feed_ids = []
1691 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
1692 for feed in multifeed_metadata_list.split(','):
1693 # Unquote should take place before split on comma (,) since textual
1694 # fields may contain comma as well (see
1695 # https://github.com/rg3/youtube-dl/issues/8536)
1696 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1697 entries.append({
1698 '_type': 'url_transparent',
1699 'ie_key': 'Youtube',
1700 'url': smuggle_url(
1701 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1702 {'force_singlefeed': True}),
1703 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1704 })
1705 feed_ids.append(feed_data['id'][0])
1706 self.to_screen(
1707 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1708 % (', '.join(feed_ids), video_id))
1709 return self.playlist_result(entries, video_id, video_title, video_description)
1710 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1711
1712 if view_count is None:
1713 view_count = extract_view_count(video_info)
1714
1715 # Check for "rental" videos
1716 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1717 raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
1718
        def _extract_filesize(media_url):
            # Media URLs embed the content length as a "clen" parameter,
            # either in the query string (clen=N) or as a path segment
            # (/clen/N); returns None when absent or non-numeric.
            return int_or_none(self._search_regex(
                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1722
1723 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1724 self.report_rtmp_download()
1725 formats = [{
1726 'format_id': '_rtmp',
1727 'protocol': 'rtmp',
1728 'url': video_info['conn'][0],
1729 'player_url': player_url,
1730 }]
1731 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1732 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1733 if 'rtmpe%3Dyes' in encoded_url_map:
1734 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1735 formats_spec = {}
1736 fmt_list = video_info.get('fmt_list', [''])[0]
1737 if fmt_list:
1738 for fmt in fmt_list.split(','):
1739 spec = fmt.split('/')
1740 if len(spec) > 1:
1741 width_height = spec[1].split('x')
1742 if len(width_height) == 2:
1743 formats_spec[spec[0]] = {
1744 'resolution': spec[1],
1745 'width': int_or_none(width_height[0]),
1746 'height': int_or_none(width_height[1]),
1747 }
1748 q = qualities(['small', 'medium', 'hd720'])
1749 formats = []
1750 for url_data_str in encoded_url_map.split(','):
1751 url_data = compat_parse_qs(url_data_str)
1752 if 'itag' not in url_data or 'url' not in url_data:
1753 continue
1754 format_id = url_data['itag'][0]
1755 url = url_data['url'][0]
1756
1757 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1758 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1759 jsplayer_url_json = self._search_regex(
1760 ASSETS_RE,
1761 embed_webpage if age_gate else video_webpage,
1762 'JS player URL (1)', default=None)
1763 if not jsplayer_url_json and not age_gate:
1764 # We need the embed website after all
1765 if embed_webpage is None:
1766 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1767 embed_webpage = self._download_webpage(
1768 embed_url, video_id, 'Downloading embed webpage')
1769 jsplayer_url_json = self._search_regex(
1770 ASSETS_RE, embed_webpage, 'JS player URL')
1771
1772 player_url = json.loads(jsplayer_url_json)
1773 if player_url is None:
1774 player_url_json = self._search_regex(
1775 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1776 video_webpage, 'age gate player URL')
1777 player_url = json.loads(player_url_json)
1778
1779 if 'sig' in url_data:
1780 url += '&signature=' + url_data['sig'][0]
1781 elif 's' in url_data:
1782 encrypted_sig = url_data['s'][0]
1783
1784 if self._downloader.params.get('verbose'):
1785 if player_url is None:
1786 player_version = 'unknown'
1787 player_desc = 'unknown'
1788 else:
1789 if player_url.endswith('swf'):
1790 player_version = self._search_regex(
1791 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1792 'flash player', fatal=False)
1793 player_desc = 'flash player %s' % player_version
1794 else:
1795 player_version = self._search_regex(
1796 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1797 r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
1798 player_url,
1799 'html5 player', fatal=False)
1800 player_desc = 'html5 player %s' % player_version
1801
1802 parts_sizes = self._signature_cache_id(encrypted_sig)
1803 self.to_screen('{%s} signature length %s, %s' %
1804 (format_id, parts_sizes, player_desc))
1805
1806 signature = self._decrypt_signature(
1807 encrypted_sig, video_id, player_url, age_gate)
1808 url += '&signature=' + signature
1809 if 'ratebypass' not in url:
1810 url += '&ratebypass=yes'
1811
1812 dct = {
1813 'format_id': format_id,
1814 'url': url,
1815 'player_url': player_url,
1816 }
1817 if format_id in self._formats:
1818 dct.update(self._formats[format_id])
1819 if format_id in formats_spec:
1820 dct.update(formats_spec[format_id])
1821
1822 # Some itags are not included in DASH manifest thus corresponding formats will
1823 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1824 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1825 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1826 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1827
1828 filesize = int_or_none(url_data.get(
1829 'clen', [None])[0]) or _extract_filesize(url)
1830
1831 quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
1832
1833 more_fields = {
1834 'filesize': filesize,
1835 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1836 'width': width,
1837 'height': height,
1838 'fps': int_or_none(url_data.get('fps', [None])[0]),
1839 'format_note': quality,
1840 'quality': q(quality),
1841 }
1842 for key, value in more_fields.items():
1843 if value:
1844 dct[key] = value
1845 type_ = url_data.get('type', [None])[0]
1846 if type_:
1847 type_split = type_.split(';')
1848 kind_ext = type_split[0].split('/')
1849 if len(kind_ext) == 2:
1850 kind, _ = kind_ext
1851 dct['ext'] = mimetype2ext(type_split[0])
1852 if kind in ('audio', 'video'):
1853 codecs = None
1854 for mobj in re.finditer(
1855 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1856 if mobj.group('key') == 'codecs':
1857 codecs = mobj.group('val')
1858 break
1859 if codecs:
1860 dct.update(parse_codecs(codecs))
1861 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
1862 dct['downloader_options'] = {
1863 # Youtube throttles chunks >~10M
1864 'http_chunk_size': 10485760,
1865 }
1866 formats.append(dct)
1867 elif video_info.get('hlsvp'):
1868 manifest_url = video_info['hlsvp'][0]
1869 formats = []
1870 m3u8_formats = self._extract_m3u8_formats(
1871 manifest_url, video_id, 'mp4', fatal=False)
1872 for a_format in m3u8_formats:
1873 itag = self._search_regex(
1874 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
1875 if itag:
1876 a_format['format_id'] = itag
1877 if itag in self._formats:
1878 dct = self._formats[itag].copy()
1879 dct.update(a_format)
1880 a_format = dct
1881 a_format['player_url'] = player_url
1882 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1883 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1884 formats.append(a_format)
1885 else:
1886 error_message = clean_html(video_info.get('reason', [None])[0])
1887 if not error_message:
1888 error_message = extract_unavailable_message()
1889 if error_message:
1890 raise ExtractorError(error_message, expected=True)
1891 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1892
1893 # uploader
1894 video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
1895 if video_uploader:
1896 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
1897 else:
1898 self._downloader.report_warning('unable to extract uploader name')
1899
1900 # uploader_id
1901 video_uploader_id = None
1902 video_uploader_url = None
1903 mobj = re.search(
1904 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1905 video_webpage)
1906 if mobj is not None:
1907 video_uploader_id = mobj.group('uploader_id')
1908 video_uploader_url = mobj.group('uploader_url')
1909 else:
1910 self._downloader.report_warning('unable to extract uploader nickname')
1911
1912 channel_id = self._html_search_meta(
1913 'channelId', video_webpage, 'channel id')
1914 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
1915
1916 # thumbnail image
1917 # We try first to get a high quality image:
1918 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1919 video_webpage, re.DOTALL)
1920 if m_thumb is not None:
1921 video_thumbnail = m_thumb.group(1)
1922 elif 'thumbnail_url' not in video_info:
1923 self._downloader.report_warning('unable to extract video thumbnail')
1924 video_thumbnail = None
1925 else: # don't panic if we can't find it
1926 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1927
1928 # upload date
1929 upload_date = self._html_search_meta(
1930 'datePublished', video_webpage, 'upload date', default=None)
1931 if not upload_date:
1932 upload_date = self._search_regex(
1933 [r'(?s)id="eow-date.*?>(.*?)</span>',
1934 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
1935 video_webpage, 'upload date', default=None)
1936 upload_date = unified_strdate(upload_date)
1937
1938 video_license = self._html_search_regex(
1939 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1940 video_webpage, 'license', default=None)
1941
1942 m_music = re.search(
1943 r'''(?x)
1944 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
1945 <ul[^>]*>\s*
1946 <li>(?P<title>.+?)
1947 by (?P<creator>.+?)
1948 (?:
1949 \(.+?\)|
1950 <a[^>]*
1951 (?:
1952 \bhref=["\']/red[^>]*>| # drop possible
1953 >\s*Listen ad-free with YouTube Red # YouTube Red ad
1954 )
1955 .*?
1956 )?</li
1957 ''',
1958 video_webpage)
1959 if m_music:
1960 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1961 video_creator = clean_html(m_music.group('creator'))
1962 else:
1963 video_alt_title = video_creator = None
1964
        def extract_meta(field):
            # Pull a named metadata row (e.g. 'Song', 'Artist') from the watch
            # page's "<h4 class="title">FIELD</h4><ul><li>value</li>" markup;
            # returns None when the row is not present.
            return self._html_search_regex(
                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
                video_webpage, field, default=None)
1969
1970 track = extract_meta('Song')
1971 artist = extract_meta('Artist')
1972
1973 m_episode = re.search(
1974 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
1975 video_webpage)
1976 if m_episode:
1977 series = m_episode.group('series')
1978 season_number = int(m_episode.group('season'))
1979 episode_number = int(m_episode.group('episode'))
1980 else:
1981 series = season_number = episode_number = None
1982
1983 m_cat_container = self._search_regex(
1984 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1985 video_webpage, 'categories', default=None)
1986 if m_cat_container:
1987 category = self._html_search_regex(
1988 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1989 default=None)
1990 video_categories = None if category is None else [category]
1991 else:
1992 video_categories = None
1993
1994 video_tags = [
1995 unescapeHTML(m.group('content'))
1996 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1997
        def _extract_count(count_name):
            # Read the like/dislike counter from the sentiment-bar button
            # ("...-like-button" / "...-dislike-button"); str_to_int strips
            # thousands separators ('1,234' -> 1234).  None when not found.
            return str_to_int(self._search_regex(
                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
                % re.escape(count_name),
                video_webpage, count_name, default=None))
2003
2004 like_count = _extract_count('like')
2005 dislike_count = _extract_count('dislike')
2006
2007 # subtitles
2008 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2009 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2010
2011 video_duration = try_get(
2012 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2013 if not video_duration:
2014 video_duration = parse_duration(self._html_search_meta(
2015 'duration', video_webpage, 'video duration'))
2016
2017 # annotations
2018 video_annotations = None
2019 if self._downloader.params.get('writeannotations', False):
2020 video_annotations = self._extract_annotations(video_id)
2021
2022 chapters = self._extract_chapters(description_original, video_duration)
2023
2024 # Look for the DASH manifest
2025 if self._downloader.params.get('youtube_include_dash_manifest', True):
2026 dash_mpd_fatal = True
2027 for mpd_url in dash_mpds:
2028 dash_formats = {}
2029 try:
2030 def decrypt_sig(mobj):
2031 s = mobj.group(1)
2032 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2033 return '/signature/%s' % dec_s
2034
2035 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2036
2037 for df in self._extract_mpd_formats(
2038 mpd_url, video_id, fatal=dash_mpd_fatal,
2039 formats_dict=self._formats):
2040 if not df.get('filesize'):
2041 df['filesize'] = _extract_filesize(df['url'])
2042 # Do not overwrite DASH format found in some previous DASH manifest
2043 if df['format_id'] not in dash_formats:
2044 dash_formats[df['format_id']] = df
2045 # Additional DASH manifests may end up in HTTP Error 403 therefore
2046 # allow them to fail without bug report message if we already have
2047 # some DASH manifest succeeded. This is temporary workaround to reduce
2048 # burst of bug reports until we figure out the reason and whether it
2049 # can be fixed at all.
2050 dash_mpd_fatal = False
2051 except (ExtractorError, KeyError) as e:
2052 self.report_warning(
2053 'Skipping DASH manifest: %r' % e, video_id)
2054 if dash_formats:
2055 # Remove the formats we found through non-DASH, they
2056 # contain less info and it can be wrong, because we use
2057 # fixed values (for example the resolution). See
2058 # https://github.com/rg3/youtube-dl/issues/5774 for an
2059 # example.
2060 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2061 formats.extend(dash_formats.values())
2062
2063 # Check for malformed aspect ratio
2064 stretched_m = re.search(
2065 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2066 video_webpage)
2067 if stretched_m:
2068 w = float(stretched_m.group('w'))
2069 h = float(stretched_m.group('h'))
2070 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2071 # We will only process correct ratios.
2072 if w > 0 and h > 0:
2073 ratio = w / h
2074 for f in formats:
2075 if f.get('vcodec') != 'none':
2076 f['stretched_ratio'] = ratio
2077
2078 self._sort_formats(formats)
2079
2080 self.mark_watched(video_id, video_info)
2081
2082 return {
2083 'id': video_id,
2084 'uploader': video_uploader,
2085 'uploader_id': video_uploader_id,
2086 'uploader_url': video_uploader_url,
2087 'channel_id': channel_id,
2088 'channel_url': channel_url,
2089 'upload_date': upload_date,
2090 'license': video_license,
2091 'creator': video_creator or artist,
2092 'title': video_title,
2093 'alt_title': video_alt_title or track,
2094 'thumbnail': video_thumbnail,
2095 'description': video_description,
2096 'categories': video_categories,
2097 'tags': video_tags,
2098 'subtitles': video_subtitles,
2099 'automatic_captions': automatic_captions,
2100 'duration': video_duration,
2101 'age_limit': 18 if age_gate else 0,
2102 'annotations': video_annotations,
2103 'chapters': chapters,
2104 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2105 'view_count': view_count,
2106 'like_count': like_count,
2107 'dislike_count': dislike_count,
2108 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
2109 'formats': formats,
2110 'is_live': is_live,
2111 'start_time': start_time,
2112 'end_time': end_time,
2113 'series': series,
2114 'season_number': season_number,
2115 'episode_number': episode_number,
2116 'track': track,
2117 'artist': artist,
2118 }
2119
2120
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extract videos from YouTube playlists.

    Handles regular playlists, auto-generated mixes (ids starting with
    RD/UL/PU, which require page-by-page scraping of watch pages) and
    music album playlists (OLAK5uy_ prefix).  Watch URLs carrying both a
    video id and a playlist id may fall back to single-video extraction
    (see _real_extract / _check_download_just_video).
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            youtube\.com/
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2017 華語最新單曲 (2/24更新)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'license': 'Standard YouTube License',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Authenticate before extraction (playlists may be private)."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix by crawling successive watch pages.

        Mixes have no playlist page of their own; each watch page only
        exposes the next slice of the mix, so pages are fetched until no
        previously-unseen video ids turn up.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            # Continue the crawl from the last video found so far.
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Try progressively less specific title containers on the last page.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page.

        Returns a (has_videos, playlist_result) tuple; has_videos is False
        when the playlist URL does not actually serve any entries, so the
        caller can fall back to single-video extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                # Language-chooser alert is informational; not an error.
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        # Uploader name and id/url both live in the pl-header-details list.
        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Return (video_id, result) for URLs that also reference a video.

        result is a single-video url_result only when --no-playlist was
        given; otherwise it is None.  video_id is None when the URL carries
        no video id at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix and regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/rg3/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2416
2417
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract all videos of a YouTube channel.

    Page-by-page channel listings are capped by YouTube (see #5778), so when
    the channel's auto-generated "uploads" playlist id (UU...) can be
    determined, extraction is delegated to YoutubePlaylistIE instead.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Overridden by subclasses (e.g. YoutubeUserIE) that need parts of
        # the original URL; for plain channels only the id matters.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel, preferring its uploads playlist when available."""
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to the channel id embedded in app-deep-link meta tags.
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # A channel id UCxxxx maps to its uploads playlist UUxxxx.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # Empty channel: surface YouTube's own alert message if present.
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2507
2508
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a user's (or legacy custom channel's) uploaded videos."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Our _VALID_URL is very permissive; let every other YouTube
        # extractor in this module have first claim on the URL so we do
        # not shadow more specific matches.
        for name, klass in globals().items():
            if (name.startswith('Youtube') and name.endswith('IE')
                    and klass is not cls and klass.suitable(url)):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Re-match to recover whether the path used 'user' or 'c'; plain
        # https://www.youtube.com/<name> URLs default to the 'user' form.
        matched = re.match(self._VALID_URL, url)
        path_kind = matched.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, matched.group('id'))
2559
2560
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a channel's /live URL to its currently running stream."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        matched = re.match(self._VALID_URL, url)
        channel_id = matched.group('id')
        base_url = matched.group('base_url')
        # Page download is best-effort; on failure fall through to the
        # plain channel URL below.
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            looks_like_video = (
                page_type.startswith('video')
                and video_id
                and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
            if looks_like_video:
                return self.url_result(video_id, YoutubeIE.ie_key())
        # No live video found on the page: delegate to the channel/user IE.
        return self.url_result(base_url)
2611
2612
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the /playlists tab of a user or channel.

    All extraction logic lives in YoutubePlaylistsBaseInfoExtractor; this
    subclass only supplies the URL pattern, names and tests.
    """
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
2641
2642
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared video-link pattern for the search result extractors."""
    # Matches an /watch?v=<11-char id> href in a search result entry; the
    # optional tail additionally captures the entry's title="..." attribute
    # when it appears on the same tag.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2645
2646
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for the ytsearch pseudo-URL scheme (ytsearchN:query)."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query parameters merged into the results URL; subclasses
    # override this to change e.g. the sort order.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML results (fetched via the spf=navigate JSON
        wrapper) until `n` videos are collected or no further page exists.
        Raises ExtractorError when YouTube reports no results at all.
        """
        videos = []
        limit = n

        url_query = {
            # encode for Python 2 compatibility with urlencode
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop once enough results are gathered; using >= (instead of >)
            # avoids downloading one needless extra page when exactly `limit`
            # results have been collected. The final list is trimmed below
            # either way, so the returned results are unchanged.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
2695
2696
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search extractor as YoutubeSearchIE, sorted by upload date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Query parameter that switches the result ordering to newest-first
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2702
2703
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for direct YouTube search result page URLs."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The decoded query doubles as both the video id for logging and
        # the resulting playlist title.
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
2724
2725
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for multi-season show pages, treated as playlist listings."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just a collection of playlists; delegate to the base
        # playlists extractor pointed at the show's /playlists page.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
2743
2744
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        # Derived name, e.g. 'youtube:recommended'
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authentication is mandatory
        self._login()

    def _entries(self, page):
        """Yield video results from a feed page, following 'load more' links."""
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        #
        # Use a set for seen-id membership tests: the old list-based
        # `video_id not in ids` check scanned the whole history on every
        # candidate, going quadratic across many pages. orderedSet still
        # preserves first-seen order within each page.
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
2796
2797
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's Watch Later list (id 'WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL with list=WL may refer to one specific video; prefer
        # that, otherwise fall back to the whole WL playlist.
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        return self._extract_playlist('WL')[1]
2817
2818
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites list."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # Pull the concrete playlist id out of the favourites page markup
        # and hand extraction over to the playlist extractor.
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
2829
2830
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for /feed/recommended; logic lives in the base class."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
2836
2837
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for /feed/subscriptions; logic lives in the base class."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
2843
2844
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for /feed/history; logic lives in the base class."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
2850
2851
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs whose video id got cut off (usually by
    an unquoted '&' in the shell) and fail with a helpful message instead
    of a confusing generic error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch URLs that carry only auxiliary parameters (feature,
    # annotation_id, x-yt-cl, hl, t) but no v=..., plus bare
    # attribution_link URLs missing their target.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: by construction the matched URL has no video id.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)
2899
2900
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the required 11
    characters and fail with an explicit message."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Extraction is impossible by definition; report the truncated id.
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (
            truncated_id, url)
        raise ExtractorError(message, expected=True)