]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[youtube] Remove info el for get_video_info request
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 clean_html,
30 dict_get,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_attribute,
35 get_element_by_id,
36 int_or_none,
37 mimetype2ext,
38 orderedSet,
39 parse_codecs,
40 parse_duration,
41 qualities,
42 remove_quotes,
43 remove_start,
44 smuggle_url,
45 str_or_none,
46 str_to_int,
47 try_get,
48 unescapeHTML,
49 unified_strdate,
50 unsmuggle_url,
51 uppercase_escape,
52 url_or_none,
53 urlencode_postdata,
54 )
55
56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        # Force the English interface so scraped strings are predictable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap bare video IDs into url_result dicts handled by YoutubeIE.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one Google sign-in protocol request.  The response is JSON
            # preceded by an anti-XSSI prefix, which transform_source strips
            # (everything before the first '[').
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The nested lists below mirror the (undocumented) wire format of
        # Google's sign-in endpoints; element positions are significant and
        # must not be reordered.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesize the conditional: '%' binds tighter than if/else,
            # so without the parentheses the 'Unable to login:' prefix was
            # dropped for every message except INCORRECT_ANSWER_ENTERED.
            warn(
                'Unable to login: %s' % (
                    'Invalid password'
                    if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the login message above.
                    warn(
                        'Unable to finish TFA: %s' % (
                            'Invalid TFA code'
                            if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Request the old (non-Polymer) layout for every page download; the
        # scraping regexes in this file target the old markup.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Called once before extraction: set language cookie, then log in
        # (login failures were already reported via warnings).
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
277
278
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Shared pagination logic for listings that use a "Load more" button.
    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following "Load more" AJAX pages."""
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(content_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if load_more is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # A "Load more" button may be present even when there are
                # no further videos; stop in that case.
                break
            more_widget_html = more['load_more_widget_html']
301
302
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Turn every (id, title) pair scraped from the page into a url_result.
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs matched by _VIDEO_RE on *page*.

        Duplicate IDs keep their first position; a missing title is filled
        in from a later occurrence that has one.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # Fixed to test the 'index' group: the 'id' group is an 11-char
            # video ID and could never equal '0', so the old check
            # (mobj.group('id') == '0') was dead code contradicting the
            # comment above.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
327
328
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Emit one url_result per unique playlist ID found on the page.
        playlist_ids = orderedSet(re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content))
        for playlist_id in playlist_ids:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the listing page and hand pagination off to _entries()."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, playlist_id), playlist_id, title)
342
343
344 class YoutubeIE(YoutubeBaseInfoExtractor):
345 IE_DESC = 'YouTube.com'
346 _VALID_URL = r"""(?x)^
347 (
348 (?:https?://|//) # http(s):// or protocol-independent URL
349 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
350 (?:www\.)?deturl\.com/www\.youtube\.com/|
351 (?:www\.)?pwnyoutube\.com/|
352 (?:www\.)?hooktube\.com/|
353 (?:www\.)?yourepeat\.com/|
354 tube\.majestyc\.net/|
355 (?:(?:www|dev)\.)?invidio\.us/|
356 (?:www\.)?invidiou\.sh/|
357 (?:www\.)?invidious\.snopyta\.org/|
358 (?:www\.)?invidious\.kabi\.tk/|
359 (?:www\.)?vid\.wxzm\.sx/|
360 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
361 (?:.*?\#/)? # handle anchor (#/) redirect urls
362 (?: # the various things that can precede the ID:
363 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
364 |(?: # or the v= param in all its forms
365 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
366 (?:\?|\#!?) # the params delimiter ? or # or #!
367 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
368 v=
369 )
370 ))
371 |(?:
372 youtu\.be| # just youtu.be/xxxx
373 vid\.plus| # or vid.plus/xxxx
374 zwearz\.com/watch| # or zwearz.com/watch/xxxx
375 )/
376 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
377 )
378 )? # all until now is optional -> you can pass the naked ID
379 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
380 (?!.*?\blist=
381 (?:
382 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
383 WL # WL are handled by the watch later IE
384 )
385 )
386 (?(1).+)? # if we found the ID, everything can follow
387 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
388 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
389 _formats = {
390 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
391 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
392 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
393 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
394 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
395 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
396 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
397 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
398 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
399 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
400 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
401 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
402 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
403 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
404 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
405 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
406 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
407 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
408
409
410 # 3D videos
411 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
412 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
413 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
414 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
415 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
416 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
417 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
418
419 # Apple HTTP Live Streaming
420 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
421 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
422 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
423 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
424 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
425 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
426 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
427 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
428
429 # DASH mp4 video
430 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
431 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
432 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
433 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
434 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
435 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
436 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
437 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
438 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
439 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
440 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
441 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
442
443 # Dash mp4 audio
444 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
445 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
446 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
447 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
448 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
449 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
450 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
451
452 # Dash webm
453 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
454 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
455 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
456 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
457 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
458 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
459 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
460 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
461 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
462 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
463 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
465 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
466 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
467 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
468 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
469 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
470 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
471 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
472 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
473 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
474 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
475
476 # Dash webm audio
477 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
478 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
479
480 # Dash webm audio with opus inside
481 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
482 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
483 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
484
485 # RTMP (unnamed)
486 '_rtmp': {'protocol': 'rtmp'},
487 }
488 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
489
490 _GEO_BYPASS = False
491
492 IE_NAME = 'youtube'
493 _TESTS = [
494 {
495 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
496 'info_dict': {
497 'id': 'BaW_jenozKc',
498 'ext': 'mp4',
499 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
500 'uploader': 'Philipp Hagemeister',
501 'uploader_id': 'phihag',
502 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
503 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
504 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
505 'upload_date': '20121002',
506 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
507 'categories': ['Science & Technology'],
508 'tags': ['youtube-dl'],
509 'duration': 10,
510 'view_count': int,
511 'like_count': int,
512 'dislike_count': int,
513 'start_time': 1,
514 'end_time': 9,
515 }
516 },
517 {
518 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
519 'note': 'Test generic use_cipher_signature video (#897)',
520 'info_dict': {
521 'id': 'UxxajLWwzqY',
522 'ext': 'mp4',
523 'upload_date': '20120506',
524 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
525 'alt_title': 'I Love It (feat. Charli XCX)',
526 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
527 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
528 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
529 'iconic ep', 'iconic', 'love', 'it'],
530 'duration': 180,
531 'uploader': 'Icona Pop',
532 'uploader_id': 'IconaPop',
533 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
534 'creator': 'Icona Pop',
535 'track': 'I Love It (feat. Charli XCX)',
536 'artist': 'Icona Pop',
537 }
538 },
539 {
540 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
541 'note': 'Test VEVO video with age protection (#956)',
542 'info_dict': {
543 'id': '07FYdnEawAQ',
544 'ext': 'mp4',
545 'upload_date': '20130703',
546 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
547 'alt_title': 'Tunnel Vision',
548 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
549 'duration': 419,
550 'uploader': 'justintimberlakeVEVO',
551 'uploader_id': 'justintimberlakeVEVO',
552 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
553 'creator': 'Justin Timberlake',
554 'track': 'Tunnel Vision',
555 'artist': 'Justin Timberlake',
556 'age_limit': 18,
557 }
558 },
559 {
560 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
561 'note': 'Embed-only video (#1746)',
562 'info_dict': {
563 'id': 'yZIXLfi8CZQ',
564 'ext': 'mp4',
565 'upload_date': '20120608',
566 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
567 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
568 'uploader': 'SET India',
569 'uploader_id': 'setindia',
570 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
571 'age_limit': 18,
572 }
573 },
574 {
575 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
576 'note': 'Use the first video ID in the URL',
577 'info_dict': {
578 'id': 'BaW_jenozKc',
579 'ext': 'mp4',
580 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
581 'uploader': 'Philipp Hagemeister',
582 'uploader_id': 'phihag',
583 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
584 'upload_date': '20121002',
585 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
586 'categories': ['Science & Technology'],
587 'tags': ['youtube-dl'],
588 'duration': 10,
589 'view_count': int,
590 'like_count': int,
591 'dislike_count': int,
592 },
593 'params': {
594 'skip_download': True,
595 },
596 },
597 {
598 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
599 'note': '256k DASH audio (format 141) via DASH manifest',
600 'info_dict': {
601 'id': 'a9LDPn-MO4I',
602 'ext': 'm4a',
603 'upload_date': '20121002',
604 'uploader_id': '8KVIDEO',
605 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
606 'description': '',
607 'uploader': '8KVIDEO',
608 'title': 'UHDTV TEST 8K VIDEO.mp4'
609 },
610 'params': {
611 'youtube_include_dash_manifest': True,
612 'format': '141',
613 },
614 'skip': 'format 141 not served anymore',
615 },
616 # DASH manifest with encrypted signature
617 {
618 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
619 'info_dict': {
620 'id': 'IB3lcPjvWLA',
621 'ext': 'm4a',
622 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
623 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
624 'duration': 244,
625 'uploader': 'AfrojackVEVO',
626 'uploader_id': 'AfrojackVEVO',
627 'upload_date': '20131011',
628 },
629 'params': {
630 'youtube_include_dash_manifest': True,
631 'format': '141/bestaudio[ext=m4a]',
632 },
633 },
634 # JS player signature function name containing $
635 {
636 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
637 'info_dict': {
638 'id': 'nfWlot6h_JM',
639 'ext': 'm4a',
640 'title': 'Taylor Swift - Shake It Off',
641 'description': 'md5:bec2185232c05479482cb5a9b82719bf',
642 'duration': 242,
643 'uploader': 'TaylorSwiftVEVO',
644 'uploader_id': 'TaylorSwiftVEVO',
645 'upload_date': '20140818',
646 'creator': 'Taylor Swift',
647 },
648 'params': {
649 'youtube_include_dash_manifest': True,
650 'format': '141/bestaudio[ext=m4a]',
651 },
652 },
653 # Controversy video
654 {
655 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
656 'info_dict': {
657 'id': 'T4XJQO3qol8',
658 'ext': 'mp4',
659 'duration': 219,
660 'upload_date': '20100909',
661 'uploader': 'Amazing Atheist',
662 'uploader_id': 'TheAmazingAtheist',
663 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
664 'title': 'Burning Everyone\'s Koran',
665 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
666 }
667 },
668 # Normal age-gate video (No vevo, embed allowed)
669 {
670 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
671 'info_dict': {
672 'id': 'HtVdAasjOgU',
673 'ext': 'mp4',
674 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
675 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
676 'duration': 142,
677 'uploader': 'The Witcher',
678 'uploader_id': 'WitcherGame',
679 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
680 'upload_date': '20140605',
681 'age_limit': 18,
682 },
683 },
684 # Age-gate video with encrypted signature
685 {
686 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
687 'info_dict': {
688 'id': '6kLq3WMV1nU',
689 'ext': 'mp4',
690 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
691 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
692 'duration': 246,
693 'uploader': 'LloydVEVO',
694 'uploader_id': 'LloydVEVO',
695 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
696 'upload_date': '20110629',
697 'age_limit': 18,
698 },
699 },
700 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
701 # YouTube Red ad is not captured for creator
702 {
703 'url': '__2ABJjxzNo',
704 'info_dict': {
705 'id': '__2ABJjxzNo',
706 'ext': 'mp4',
707 'duration': 266,
708 'upload_date': '20100430',
709 'uploader_id': 'deadmau5',
710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
711 'creator': 'deadmau5',
712 'description': 'md5:12c56784b8032162bb936a5f76d55360',
713 'uploader': 'deadmau5',
714 'title': 'Deadmau5 - Some Chords (HD)',
715 'alt_title': 'Some Chords',
716 },
717 'expected_warnings': [
718 'DASH manifest missing',
719 ]
720 },
721 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
722 {
723 'url': 'lqQg6PlCWgI',
724 'info_dict': {
725 'id': 'lqQg6PlCWgI',
726 'ext': 'mp4',
727 'duration': 6085,
728 'upload_date': '20150827',
729 'uploader_id': 'olympic',
730 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
731 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
732 'uploader': 'Olympic',
733 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
734 },
735 'params': {
736 'skip_download': 'requires avconv',
737 }
738 },
739 # Non-square pixels
740 {
741 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
742 'info_dict': {
743 'id': '_b-2C3KPAM0',
744 'ext': 'mp4',
745 'stretched_ratio': 16 / 9.,
746 'duration': 85,
747 'upload_date': '20110310',
748 'uploader_id': 'AllenMeow',
749 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
750 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
751 'uploader': '孫ᄋᄅ',
752 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
753 },
754 },
755 # url_encoded_fmt_stream_map is empty string
756 {
757 'url': 'qEJwOuvDf7I',
758 'info_dict': {
759 'id': 'qEJwOuvDf7I',
760 'ext': 'webm',
761 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
762 'description': '',
763 'upload_date': '20150404',
764 'uploader_id': 'spbelect',
765 'uploader': 'Наблюдатели Петербурга',
766 },
767 'params': {
768 'skip_download': 'requires avconv',
769 },
770 'skip': 'This live event has ended.',
771 },
772 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
773 {
774 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
775 'info_dict': {
776 'id': 'FIl7x6_3R5Y',
777 'ext': 'webm',
778 'title': 'md5:7b81415841e02ecd4313668cde88737a',
779 'description': 'md5:116377fd2963b81ec4ce64b542173306',
780 'duration': 220,
781 'upload_date': '20150625',
782 'uploader_id': 'dorappi2000',
783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
784 'uploader': 'dorappi2000',
785 'formats': 'mincount:31',
786 },
787 'skip': 'not actual anymore',
788 },
789 # DASH manifest with segment_list
790 {
791 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
792 'md5': '8ce563a1d667b599d21064e982ab9e31',
793 'info_dict': {
794 'id': 'CsmdDsKjzN8',
795 'ext': 'mp4',
796 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
797 'uploader': 'Airtek',
798 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
799 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
800 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
801 },
802 'params': {
803 'youtube_include_dash_manifest': True,
804 'format': '135', # bestvideo
805 },
806 'skip': 'This live event has ended.',
807 },
808 {
809 # Multifeed videos (multiple cameras), URL is for Main Camera
810 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
811 'info_dict': {
812 'id': 'jqWvoWXjCVs',
813 'title': 'teamPGP: Rocket League Noob Stream',
814 'description': 'md5:dc7872fb300e143831327f1bae3af010',
815 },
816 'playlist': [{
817 'info_dict': {
818 'id': 'jqWvoWXjCVs',
819 'ext': 'mp4',
820 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
821 'description': 'md5:dc7872fb300e143831327f1bae3af010',
822 'duration': 7335,
823 'upload_date': '20150721',
824 'uploader': 'Beer Games Beer',
825 'uploader_id': 'beergamesbeer',
826 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
827 'license': 'Standard YouTube License',
828 },
829 }, {
830 'info_dict': {
831 'id': '6h8e8xoXJzg',
832 'ext': 'mp4',
833 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
834 'description': 'md5:dc7872fb300e143831327f1bae3af010',
835 'duration': 7337,
836 'upload_date': '20150721',
837 'uploader': 'Beer Games Beer',
838 'uploader_id': 'beergamesbeer',
839 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
840 'license': 'Standard YouTube License',
841 },
842 }, {
843 'info_dict': {
844 'id': 'PUOgX5z9xZw',
845 'ext': 'mp4',
846 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
847 'description': 'md5:dc7872fb300e143831327f1bae3af010',
848 'duration': 7337,
849 'upload_date': '20150721',
850 'uploader': 'Beer Games Beer',
851 'uploader_id': 'beergamesbeer',
852 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
853 'license': 'Standard YouTube License',
854 },
855 }, {
856 'info_dict': {
857 'id': 'teuwxikvS5k',
858 'ext': 'mp4',
859 'title': 'teamPGP: Rocket League Noob Stream (zim)',
860 'description': 'md5:dc7872fb300e143831327f1bae3af010',
861 'duration': 7334,
862 'upload_date': '20150721',
863 'uploader': 'Beer Games Beer',
864 'uploader_id': 'beergamesbeer',
865 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
866 'license': 'Standard YouTube License',
867 },
868 }],
869 'params': {
870 'skip_download': True,
871 },
872 'skip': 'This video is not available.',
873 },
874 {
875 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
876 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
877 'info_dict': {
878 'id': 'gVfLd0zydlo',
879 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
880 },
881 'playlist_count': 2,
882 'skip': 'Not multifeed anymore',
883 },
884 {
885 'url': 'https://vid.plus/FlRa-iH7PGw',
886 'only_matching': True,
887 },
888 {
889 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
890 'only_matching': True,
891 },
892 {
893 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
894 # Also tests cut-off URL expansion in video description (see
895 # https://github.com/ytdl-org/youtube-dl/issues/1892,
896 # https://github.com/ytdl-org/youtube-dl/issues/8164)
897 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
898 'info_dict': {
899 'id': 'lsguqyKfVQg',
900 'ext': 'mp4',
901 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
902 'alt_title': 'Dark Walk - Position Music',
903 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
904 'duration': 133,
905 'upload_date': '20151119',
906 'uploader_id': 'IronSoulElf',
907 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
908 'uploader': 'IronSoulElf',
909 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
910 'track': 'Dark Walk - Position Music',
911 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
912 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
913 },
914 'params': {
915 'skip_download': True,
916 },
917 },
918 {
919 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
920 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
921 'only_matching': True,
922 },
923 {
924 # Video with yt:stretch=17:0
925 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
926 'info_dict': {
927 'id': 'Q39EVAstoRM',
928 'ext': 'mp4',
929 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
930 'description': 'md5:ee18a25c350637c8faff806845bddee9',
931 'upload_date': '20151107',
932 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
933 'uploader': 'CH GAMER DROID',
934 },
935 'params': {
936 'skip_download': True,
937 },
938 'skip': 'This video does not exist.',
939 },
940 {
941 # Video licensed under Creative Commons
942 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
943 'info_dict': {
944 'id': 'M4gD1WSo5mA',
945 'ext': 'mp4',
946 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
947 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
948 'duration': 721,
949 'upload_date': '20150127',
950 'uploader_id': 'BerkmanCenter',
951 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
952 'uploader': 'The Berkman Klein Center for Internet & Society',
953 'license': 'Creative Commons Attribution license (reuse allowed)',
954 },
955 'params': {
956 'skip_download': True,
957 },
958 },
959 {
960 # Channel-like uploader_url
961 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
962 'info_dict': {
963 'id': 'eQcmzGIKrzg',
964 'ext': 'mp4',
965 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
966 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
967 'duration': 4060,
968 'upload_date': '20151119',
969 'uploader': 'Bernie Sanders',
970 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
971 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
972 'license': 'Creative Commons Attribution license (reuse allowed)',
973 },
974 'params': {
975 'skip_download': True,
976 },
977 },
978 {
979 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
980 'only_matching': True,
981 },
982 {
983 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
984 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
985 'only_matching': True,
986 },
987 {
988 # Rental video preview
989 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
990 'info_dict': {
991 'id': 'uGpuVWrhIzE',
992 'ext': 'mp4',
993 'title': 'Piku - Trailer',
994 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
995 'upload_date': '20150811',
996 'uploader': 'FlixMatrix',
997 'uploader_id': 'FlixMatrixKaravan',
998 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
999 'license': 'Standard YouTube License',
1000 },
1001 'params': {
1002 'skip_download': True,
1003 },
1004 'skip': 'This video is not available.',
1005 },
1006 {
1007 # YouTube Red video with episode data
1008 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1009 'info_dict': {
1010 'id': 'iqKdEhx-dD4',
1011 'ext': 'mp4',
1012 'title': 'Isolation - Mind Field (Ep 1)',
1013 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1014 'duration': 2085,
1015 'upload_date': '20170118',
1016 'uploader': 'Vsauce',
1017 'uploader_id': 'Vsauce',
1018 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1019 'series': 'Mind Field',
1020 'season_number': 1,
1021 'episode_number': 1,
1022 },
1023 'params': {
1024 'skip_download': True,
1025 },
1026 'expected_warnings': [
1027 'Skipping DASH manifest',
1028 ],
1029 },
1030 {
1031 # The following content has been identified by the YouTube community
1032 # as inappropriate or offensive to some audiences.
1033 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1034 'info_dict': {
1035 'id': '6SJNVb0GnPI',
1036 'ext': 'mp4',
1037 'title': 'Race Differences in Intelligence',
1038 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1039 'duration': 965,
1040 'upload_date': '20140124',
1041 'uploader': 'New Century Foundation',
1042 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1043 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1044 },
1045 'params': {
1046 'skip_download': True,
1047 },
1048 },
1049 {
1050 # itag 212
1051 'url': '1t24XAntNCY',
1052 'only_matching': True,
1053 },
1054 {
1055 # geo restricted to JP
1056 'url': 'sJL6WA-aGkQ',
1057 'only_matching': True,
1058 },
1059 {
1060 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1061 'only_matching': True,
1062 },
1063 {
1064 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1065 'only_matching': True,
1066 },
1067 {
1068 # DRM protected
1069 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1070 'only_matching': True,
1071 },
1072 {
1073 # Video with unsupported adaptive stream type formats
1074 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1075 'info_dict': {
1076 'id': 'Z4Vy8R84T1U',
1077 'ext': 'mp4',
1078 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1079 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1080 'duration': 433,
1081 'upload_date': '20130923',
1082 'uploader': 'Amelia Putri Harwita',
1083 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1084 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1085 'formats': 'maxcount:10',
1086 },
1087 'params': {
1088 'skip_download': True,
1089 'youtube_include_dash_manifest': False,
1090 },
1091 },
1092 {
1093 # Youtube Music Auto-generated description
1094 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1095 'info_dict': {
1096 'id': 'MgNrAu2pzNs',
1097 'ext': 'mp4',
1098 'title': 'Voyeur Girl',
1099 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1100 'upload_date': '20190312',
1101 'uploader': 'Various Artists - Topic',
1102 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
1103 'artist': 'Stephen',
1104 'track': 'Voyeur Girl',
1105 'album': 'it\'s too much love to know my dear',
1106 'release_date': '20190313',
1107 'release_year': 2019,
1108 },
1109 'params': {
1110 'skip_download': True,
1111 },
1112 },
1113 {
1114 # Youtube Music Auto-generated description
1115 # Retrieve 'artist' field from 'Artist:' in video description
1116 # when it is present on youtube music video
1117 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1118 'info_dict': {
1119 'id': 'k0jLE7tTwjY',
1120 'ext': 'mp4',
1121 'title': 'Latch Feat. Sam Smith',
1122 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1123 'upload_date': '20150110',
1124 'uploader': 'Various Artists - Topic',
1125 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1126 'artist': 'Disclosure',
1127 'track': 'Latch Feat. Sam Smith',
1128 'album': 'Latch Featuring Sam Smith',
1129 'release_date': '20121008',
1130 'release_year': 2012,
1131 },
1132 'params': {
1133 'skip_download': True,
1134 },
1135 },
1136 {
1137 # Youtube Music Auto-generated description
1138 # handle multiple artists on youtube music video
1139 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1140 'info_dict': {
1141 'id': '74qn0eJSjpA',
1142 'ext': 'mp4',
1143 'title': 'Eastside',
1144 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1145 'upload_date': '20180710',
1146 'uploader': 'Benny Blanco - Topic',
1147 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1148 'artist': 'benny blanco, Halsey, Khalid',
1149 'track': 'Eastside',
1150 'album': 'Eastside',
1151 'release_date': '20180713',
1152 'release_year': 2018,
1153 },
1154 'params': {
1155 'skip_download': True,
1156 },
1157 },
1158 {
1159 # Youtube Music Auto-generated description
1160 # handle youtube music video with release_year and no release_date
1161 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1162 'info_dict': {
1163 'id': '-hcAI0g-f5M',
1164 'ext': 'mp4',
1165 'title': 'Put It On Me',
1166 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
1167 'upload_date': '20180426',
1168 'uploader': 'Matt Maeson - Topic',
1169 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1170 'artist': 'Matt Maeson',
1171 'track': 'Put It On Me',
1172 'album': 'The Hearse',
1173 'release_date': None,
1174 'release_year': 2018,
1175 },
1176 'params': {
1177 'skip_download': True,
1178 },
1179 },
1180 ]
1181
1182 def __init__(self, *args, **kwargs):
1183 super(YoutubeIE, self).__init__(*args, **kwargs)
1184 self._player_cache = {}
1185
1186 def report_video_info_webpage_download(self, video_id):
1187 """Report attempt to download video info webpage."""
1188 self.to_screen('%s: Downloading video info webpage' % video_id)
1189
1190 def report_information_extraction(self, video_id):
1191 """Report attempt to extract video information."""
1192 self.to_screen('%s: Extracting video information' % video_id)
1193
1194 def report_unavailable_format(self, video_id, format):
1195 """Report extracted video URL."""
1196 self.to_screen('%s: Format %s not available' % (video_id, format))
1197
1198 def report_rtmp_download(self):
1199 """Indicate the download will use the RTMP protocol."""
1200 self.to_screen('RTMP download detected')
1201
1202 def _signature_cache_id(self, example_sig):
1203 """ Return a string representation of a signature """
1204 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1205
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (and cache on disk) the signature-deciphering function.

        The player type ('js' or 'swf') and player id are parsed out of
        player_url.  Because the deciphering observed here amounts to a pure
        character permutation, the function is cached as a list of source
        indices, keyed by player and by the length pattern of example_sig.
        Raises ExtractorError when player_url cannot be recognized.
        """
        # Derive the player id and type (file extension) from the player URL
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache file name, so it must not contain
        # path separators
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of source indices: replay the
            # recorded permutation directly without downloading the player
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string whose characters are
        # all distinct, so the result reveals which source index ends up at
        # each output position; that index list is what gets cached.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1251
    def _print_sig_code(self, func, example_sig):
        """Print Python source code equivalent to the deciphering function.

        Runs func on a probe string of distinct characters, records where
        every input character ends up, and compresses runs of consecutive
        indices into slice expressions, producing a snippet suitable for a
        static signature table.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # 'end' is the last included index; extend by one step so the
                # (exclusive) slice bound covers it, or use ':' when that
                # would run off the end of the string
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs; while the difference stays equal
            # to 'step' we are inside a sliceable run, otherwise emit either
            # the finished slice or a single-element subscript
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element/run ('i' is the last index seen above)
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1290
1291 def _parse_sig_js(self, jscode):
1292 funcname = self._search_regex(
1293 (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1294 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1295 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
1296 r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1297 r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1298 jscode, 'Initial JS player signature function name', group='sig')
1299
1300 jsi = JSInterpreter(jscode)
1301 initial_function = jsi.extract_function(funcname)
1302 return lambda s: initial_function([s])
1303
1304 def _parse_sig_swf(self, file_contents):
1305 swfi = SWFInterpreter(file_contents)
1306 TARGET_CLASSNAME = 'SignatureDecipher'
1307 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1308 initial_function = swfi.extract_function(searched_class, 'decipher')
1309 return lambda s: initial_function([s])
1310
1311 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1312 """Turn the encrypted s field into a working signature"""
1313
1314 if player_url is None:
1315 raise ExtractorError('Cannot decrypt signature without player_url')
1316
1317 if player_url.startswith('//'):
1318 player_url = 'https:' + player_url
1319 elif not re.match(r'https?://', player_url):
1320 player_url = compat_urlparse.urljoin(
1321 'https://www.youtube.com', player_url)
1322 try:
1323 player_id = (player_url, self._signature_cache_id(s))
1324 if player_id not in self._player_cache:
1325 func = self._extract_signature_function(
1326 video_id, player_url, s
1327 )
1328 self._player_cache[player_id] = func
1329 func = self._player_cache[player_id]
1330 if self._downloader.params.get('youtube_print_sig_code'):
1331 self._print_sig_code(func, s)
1332 return func(s)
1333 except Exception as e:
1334 tb = traceback.format_exc()
1335 raise ExtractorError(
1336 'Signature extraction failed: ' + tb, cause=e)
1337
    def _get_subtitles(self, video_id, webpage):
        """Fetch manually-created subtitle tracks via the legacy
        video.google.com timedtext listing API.

        Returns a dict mapping language codes to lists of format dicts
        ({'url', 'ext'}); returns an empty dict (after a warning) when the
        track list cannot be downloaded or contains no tracks.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track per language
            if lang in sub_lang_list:
                continue
            sub_formats = []
            # Offer one URL per supported subtitle format, differing
            # only in the 'fmt' query parameter
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1369
1370 def _get_ytplayer_config(self, video_id, webpage):
1371 patterns = (
1372 # User data may contain arbitrary character sequences that may affect
1373 # JSON extraction with regex, e.g. when '};' is contained the second
1374 # regex won't capture the whole JSON. Yet working around by trying more
1375 # concrete regex first keeping in mind proper quoted string handling
1376 # to be implemented in future that will replace this workaround (see
1377 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1378 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1379 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1380 r';ytplayer\.config\s*=\s*({.+?});',
1381 )
1382 config = self._search_regex(
1383 patterns, webpage, 'ytplayer.config', default=None)
1384 if config:
1385 return self._parse_json(
1386 uppercase_escape(config), video_id, fatal=False)
1387
    def _get_automatic_captions(self, video_id, webpage):
        """Return automatic (speech-recognized) captions for the video.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Three historical formats are tried in order: the legacy 'ttsurl'
        listing, the post-22.06.2017 player_response captions renderer, and
        the old caption_tracks/caption_translation_languages args.  Each
        returns a dict mapping language codes to lists of {'url', 'ext'}
        dicts; on any failure an empty dict is returned after a warning.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One entry per translation target; every format URL requests
                # translation from the original language
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the 'tlang'
                # and 'fmt' query parameters of the base caption URL
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1489
1490 def _mark_watched(self, video_id, video_info, player_response):
1491 playback_url = url_or_none(try_get(
1492 player_response,
1493 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1494 video_info, lambda x: x['videostats_playback_base_url'][0]))
1495 if not playback_url:
1496 return
1497 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1498 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1499
1500 # cpn generation algorithm is reverse engineered from base.js.
1501 # In fact it works even with dummy cpn.
1502 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1503 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1504
1505 qs.update({
1506 'ver': ['2'],
1507 'cpn': [cpn],
1508 })
1509 playback_url = compat_urlparse.urlunparse(
1510 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1511
1512 self._download_webpage(
1513 playback_url, video_id, 'Marking watched',
1514 'Unable to mark watched', fatal=False)
1515
1516 @staticmethod
1517 def _extract_urls(webpage):
1518 # Embedded YouTube player
1519 entries = [
1520 unescapeHTML(mobj.group('url'))
1521 for mobj in re.finditer(r'''(?x)
1522 (?:
1523 <iframe[^>]+?src=|
1524 data-video-url=|
1525 <embed[^>]+?src=|
1526 embedSWF\(?:\s*|
1527 <object[^>]+data=|
1528 new\s+SWFObject\(
1529 )
1530 (["\'])
1531 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1532 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1533 \1''', webpage)]
1534
1535 # lazyYT YouTube embed
1536 entries.extend(list(map(
1537 unescapeHTML,
1538 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1539
1540 # Wordpress "YouTube Video Importer" plugin
1541 matches = re.findall(r'''(?x)<div[^>]+
1542 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1543 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1544 entries.extend(m[-1] for m in matches)
1545
1546 return entries
1547
1548 @staticmethod
1549 def _extract_url(webpage):
1550 urls = YoutubeIE._extract_urls(webpage)
1551 return urls[0] if urls else None
1552
1553 @classmethod
1554 def extract_id(cls, url):
1555 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1556 if mobj is None:
1557 raise ExtractorError('Invalid URL: %s' % url)
1558 video_id = mobj.group(2)
1559 return video_id
1560
1561 def _extract_annotations(self, video_id):
1562 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1563 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1564
1565 @staticmethod
1566 def _extract_chapters(description, duration):
1567 if not description:
1568 return None
1569 chapter_lines = re.findall(
1570 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1571 description)
1572 if not chapter_lines:
1573 return None
1574 chapters = []
1575 for next_num, (chapter_line, time_point) in enumerate(
1576 chapter_lines, start=1):
1577 start_time = parse_duration(time_point)
1578 if start_time is None:
1579 continue
1580 if start_time > duration:
1581 break
1582 end_time = (duration if next_num == len(chapter_lines)
1583 else parse_duration(chapter_lines[next_num][1]))
1584 if end_time is None:
1585 continue
1586 if end_time > duration:
1587 end_time = duration
1588 if start_time > end_time:
1589 break
1590 chapter_title = re.sub(
1591 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1592 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1593 chapters.append({
1594 'start_time': start_time,
1595 'end_time': end_time,
1596 'title': chapter_title,
1597 })
1598 return chapters
1599
1600 def _real_extract(self, url):
1601 url, smuggled_data = unsmuggle_url(url, {})
1602
1603 proto = (
1604 'http' if self._downloader.params.get('prefer_insecure', False)
1605 else 'https')
1606
1607 start_time = None
1608 end_time = None
1609 parsed_url = compat_urllib_parse_urlparse(url)
1610 for component in [parsed_url.fragment, parsed_url.query]:
1611 query = compat_parse_qs(component)
1612 if start_time is None and 't' in query:
1613 start_time = parse_duration(query['t'][0])
1614 if start_time is None and 'start' in query:
1615 start_time = parse_duration(query['start'][0])
1616 if end_time is None and 'end' in query:
1617 end_time = parse_duration(query['end'][0])
1618
1619 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1620 mobj = re.search(self._NEXT_URL_RE, url)
1621 if mobj:
1622 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1623 video_id = self.extract_id(url)
1624
1625 # Get video webpage
1626 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1627 video_webpage = self._download_webpage(url, video_id)
1628
1629 # Attempt to extract SWF player URL
1630 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1631 if mobj is not None:
1632 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1633 else:
1634 player_url = None
1635
1636 dash_mpds = []
1637
1638 def add_dash_mpd(video_info):
1639 dash_mpd = video_info.get('dashmpd')
1640 if dash_mpd and dash_mpd[0] not in dash_mpds:
1641 dash_mpds.append(dash_mpd[0])
1642
1643 def add_dash_mpd_pr(pl_response):
1644 dash_mpd = url_or_none(try_get(
1645 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1646 compat_str))
1647 if dash_mpd and dash_mpd not in dash_mpds:
1648 dash_mpds.append(dash_mpd)
1649
1650 is_live = None
1651 view_count = None
1652
1653 def extract_view_count(v_info):
1654 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1655
1656 def extract_token(v_info):
1657 return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
1658
1659 player_response = {}
1660
1661 # Get video info
1662 embed_webpage = None
1663 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1664 age_gate = True
1665 # We simulate the access to the video from www.youtube.com/v/{video_id}
1666 # this can be viewed without login into Youtube
1667 url = proto + '://www.youtube.com/embed/%s' % video_id
1668 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1669 data = compat_urllib_parse_urlencode({
1670 'video_id': video_id,
1671 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1672 'sts': self._search_regex(
1673 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1674 })
1675 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1676 video_info_webpage = self._download_webpage(
1677 video_info_url, video_id,
1678 note='Refetching age-gated info webpage',
1679 errnote='unable to download video info webpage')
1680 video_info = compat_parse_qs(video_info_webpage)
1681 add_dash_mpd(video_info)
1682 else:
1683 age_gate = False
1684 video_info = None
1685 sts = None
1686 # Try looking directly into the video webpage
1687 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1688 if ytplayer_config:
1689 args = ytplayer_config['args']
1690 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1691 # Convert to the same format returned by compat_parse_qs
1692 video_info = dict((k, [v]) for k, v in args.items())
1693 add_dash_mpd(video_info)
1694 # Rental video is not rented but preview is available (e.g.
1695 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1696 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1697 if not video_info and args.get('ypc_vid'):
1698 return self.url_result(
1699 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1700 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1701 is_live = True
1702 sts = ytplayer_config.get('sts')
1703 if not player_response:
1704 pl_response = str_or_none(args.get('player_response'))
1705 if pl_response:
1706 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1707 if isinstance(pl_response, dict):
1708 player_response = pl_response
1709 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1710 add_dash_mpd_pr(player_response)
1711 # We also try looking in get_video_info since it may contain different dashmpd
1712 # URL that points to a DASH manifest with possibly different itag set (some itags
1713 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1714 # manifest pointed by get_video_info's dashmpd).
1715 # The general idea is to take a union of itags of both DASH manifests (for example
1716 # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)
1717 self.report_video_info_webpage_download(video_id)
1718 for el in ('embedded', 'detailpage', 'vevo', ''):
1719 query = {
1720 'video_id': video_id,
1721 'ps': 'default',
1722 'eurl': '',
1723 'gl': 'US',
1724 'hl': 'en',
1725 }
1726 if el:
1727 query['el'] = el
1728 if sts:
1729 query['sts'] = sts
1730 video_info_webpage = self._download_webpage(
1731 '%s://www.youtube.com/get_video_info' % proto,
1732 video_id, note=False,
1733 errnote='unable to download video info webpage',
1734 fatal=False, query=query)
1735 if not video_info_webpage:
1736 continue
1737 get_video_info = compat_parse_qs(video_info_webpage)
1738 if not player_response:
1739 pl_response = get_video_info.get('player_response', [None])[0]
1740 if isinstance(pl_response, dict):
1741 player_response = pl_response
1742 add_dash_mpd_pr(player_response)
1743 add_dash_mpd(get_video_info)
1744 if view_count is None:
1745 view_count = extract_view_count(get_video_info)
1746 if not video_info:
1747 video_info = get_video_info
1748 get_token = extract_token(get_video_info)
1749 if get_token:
1750 # Different get_video_info requests may report different results, e.g.
1751 # some may report video unavailability, but some may serve it without
1752 # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,
1753 # the original webpage as well as el=info and el=embedded get_video_info
1754 # requests report video unavailability due to geo restriction while
1755 # el=detailpage succeeds and returns valid data). This is probably
1756 # due to YouTube measures against IP ranges of hosting providers.
1757 # Working around by preferring the first succeeded video_info containing
1758 # the token if no such video_info yet was found.
1759 token = extract_token(video_info)
1760 if not token:
1761 video_info = get_video_info
1762 break
1763
1764 def extract_unavailable_message():
1765 return self._html_search_regex(
1766 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1767 video_webpage, 'unavailable message', default=None)
1768
1769 if not video_info:
1770 unavailable_message = extract_unavailable_message()
1771 if not unavailable_message:
1772 unavailable_message = 'Unable to extract video data'
1773 raise ExtractorError(
1774 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1775
1776 if video_info.get('license_info'):
1777 raise ExtractorError('This video is DRM protected.', expected=True)
1778
1779 video_details = try_get(
1780 player_response, lambda x: x['videoDetails'], dict) or {}
1781
1782 # title
1783 if 'title' in video_info:
1784 video_title = video_info['title'][0]
1785 elif 'title' in player_response:
1786 video_title = video_details['title']
1787 else:
1788 self._downloader.report_warning('Unable to extract video title')
1789 video_title = '_'
1790
1791 # description
1792 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1793 if video_description:
1794
1795 def replace_url(m):
1796 redir_url = compat_urlparse.urljoin(url, m.group(1))
1797 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1798 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1799 qs = compat_parse_qs(parsed_redir_url.query)
1800 q = qs.get('q')
1801 if q and q[0]:
1802 return q[0]
1803 return redir_url
1804
1805 description_original = video_description = re.sub(r'''(?x)
1806 <a\s+
1807 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1808 (?:title|href)="([^"]+)"\s+
1809 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1810 class="[^"]*"[^>]*>
1811 [^<]+\.{3}\s*
1812 </a>
1813 ''', replace_url, video_description)
1814 video_description = clean_html(video_description)
1815 else:
1816 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1817 if fd_mobj:
1818 video_description = unescapeHTML(fd_mobj.group(1))
1819 else:
1820 video_description = ''
1821
1822 if not smuggled_data.get('force_singlefeed', False):
1823 if not self._downloader.params.get('noplaylist'):
1824 multifeed_metadata_list = try_get(
1825 player_response,
1826 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1827 compat_str) or try_get(
1828 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1829 if multifeed_metadata_list:
1830 entries = []
1831 feed_ids = []
1832 for feed in multifeed_metadata_list.split(','):
1833 # Unquote should take place before split on comma (,) since textual
1834 # fields may contain comma as well (see
1835 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1836 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1837 entries.append({
1838 '_type': 'url_transparent',
1839 'ie_key': 'Youtube',
1840 'url': smuggle_url(
1841 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1842 {'force_singlefeed': True}),
1843 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1844 })
1845 feed_ids.append(feed_data['id'][0])
1846 self.to_screen(
1847 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1848 % (', '.join(feed_ids), video_id))
1849 return self.playlist_result(entries, video_id, video_title, video_description)
1850 else:
1851 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1852
1853 if view_count is None:
1854 view_count = extract_view_count(video_info)
1855 if view_count is None and video_details:
1856 view_count = int_or_none(video_details.get('viewCount'))
1857
1858 # Check for "rental" videos
1859 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1860 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1861
1862 def _extract_filesize(media_url):
1863 return int_or_none(self._search_regex(
1864 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1865
1866 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1867 self.report_rtmp_download()
1868 formats = [{
1869 'format_id': '_rtmp',
1870 'protocol': 'rtmp',
1871 'url': video_info['conn'][0],
1872 'player_url': player_url,
1873 }]
1874 elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1875 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1876 if 'rtmpe%3Dyes' in encoded_url_map:
1877 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1878 formats_spec = {}
1879 fmt_list = video_info.get('fmt_list', [''])[0]
1880 if fmt_list:
1881 for fmt in fmt_list.split(','):
1882 spec = fmt.split('/')
1883 if len(spec) > 1:
1884 width_height = spec[1].split('x')
1885 if len(width_height) == 2:
1886 formats_spec[spec[0]] = {
1887 'resolution': spec[1],
1888 'width': int_or_none(width_height[0]),
1889 'height': int_or_none(width_height[1]),
1890 }
1891 q = qualities(['small', 'medium', 'hd720'])
1892 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
1893 if streaming_formats:
1894 for fmt in streaming_formats:
1895 itag = str_or_none(fmt.get('itag'))
1896 if not itag:
1897 continue
1898 quality = fmt.get('quality')
1899 quality_label = fmt.get('qualityLabel') or quality
1900 formats_spec[itag] = {
1901 'asr': int_or_none(fmt.get('audioSampleRate')),
1902 'filesize': int_or_none(fmt.get('contentLength')),
1903 'format_note': quality_label,
1904 'fps': int_or_none(fmt.get('fps')),
1905 'height': int_or_none(fmt.get('height')),
1906 'quality': q(quality),
1907 # bitrate for itag 43 is always 2147483647
1908 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1909 'width': int_or_none(fmt.get('width')),
1910 }
1911 formats = []
1912 for url_data_str in encoded_url_map.split(','):
1913 url_data = compat_parse_qs(url_data_str)
1914 if 'itag' not in url_data or 'url' not in url_data:
1915 continue
1916 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1917 # Unsupported FORMAT_STREAM_TYPE_OTF
1918 if stream_type == 3:
1919 continue
1920 format_id = url_data['itag'][0]
1921 url = url_data['url'][0]
1922
1923 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1924 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1925 jsplayer_url_json = self._search_regex(
1926 ASSETS_RE,
1927 embed_webpage if age_gate else video_webpage,
1928 'JS player URL (1)', default=None)
1929 if not jsplayer_url_json and not age_gate:
1930 # We need the embed website after all
1931 if embed_webpage is None:
1932 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1933 embed_webpage = self._download_webpage(
1934 embed_url, video_id, 'Downloading embed webpage')
1935 jsplayer_url_json = self._search_regex(
1936 ASSETS_RE, embed_webpage, 'JS player URL')
1937
1938 player_url = json.loads(jsplayer_url_json)
1939 if player_url is None:
1940 player_url_json = self._search_regex(
1941 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1942 video_webpage, 'age gate player URL')
1943 player_url = json.loads(player_url_json)
1944
1945 if 'sig' in url_data:
1946 url += '&signature=' + url_data['sig'][0]
1947 elif 's' in url_data:
1948 encrypted_sig = url_data['s'][0]
1949
1950 if self._downloader.params.get('verbose'):
1951 if player_url is None:
1952 player_version = 'unknown'
1953 player_desc = 'unknown'
1954 else:
1955 if player_url.endswith('swf'):
1956 player_version = self._search_regex(
1957 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1958 'flash player', fatal=False)
1959 player_desc = 'flash player %s' % player_version
1960 else:
1961 player_version = self._search_regex(
1962 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1963 r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
1964 player_url,
1965 'html5 player', fatal=False)
1966 player_desc = 'html5 player %s' % player_version
1967
1968 parts_sizes = self._signature_cache_id(encrypted_sig)
1969 self.to_screen('{%s} signature length %s, %s' %
1970 (format_id, parts_sizes, player_desc))
1971
1972 signature = self._decrypt_signature(
1973 encrypted_sig, video_id, player_url, age_gate)
1974 url += '&signature=' + signature
1975 if 'ratebypass' not in url:
1976 url += '&ratebypass=yes'
1977
1978 dct = {
1979 'format_id': format_id,
1980 'url': url,
1981 'player_url': player_url,
1982 }
1983 if format_id in self._formats:
1984 dct.update(self._formats[format_id])
1985 if format_id in formats_spec:
1986 dct.update(formats_spec[format_id])
1987
1988 # Some itags are not included in DASH manifest thus corresponding formats will
1989 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
1990 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1991 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1992 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1993
1994 filesize = int_or_none(url_data.get(
1995 'clen', [None])[0]) or _extract_filesize(url)
1996
1997 quality = url_data.get('quality', [None])[0]
1998
1999 more_fields = {
2000 'filesize': filesize,
2001 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
2002 'width': width,
2003 'height': height,
2004 'fps': int_or_none(url_data.get('fps', [None])[0]),
2005 'format_note': url_data.get('quality_label', [None])[0] or quality,
2006 'quality': q(quality),
2007 }
2008 for key, value in more_fields.items():
2009 if value:
2010 dct[key] = value
2011 type_ = url_data.get('type', [None])[0]
2012 if type_:
2013 type_split = type_.split(';')
2014 kind_ext = type_split[0].split('/')
2015 if len(kind_ext) == 2:
2016 kind, _ = kind_ext
2017 dct['ext'] = mimetype2ext(type_split[0])
2018 if kind in ('audio', 'video'):
2019 codecs = None
2020 for mobj in re.finditer(
2021 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2022 if mobj.group('key') == 'codecs':
2023 codecs = mobj.group('val')
2024 break
2025 if codecs:
2026 dct.update(parse_codecs(codecs))
2027 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2028 dct['downloader_options'] = {
2029 # Youtube throttles chunks >~10M
2030 'http_chunk_size': 10485760,
2031 }
2032 formats.append(dct)
2033 else:
2034 manifest_url = (
2035 url_or_none(try_get(
2036 player_response,
2037 lambda x: x['streamingData']['hlsManifestUrl'],
2038 compat_str)) or
2039 url_or_none(try_get(
2040 video_info, lambda x: x['hlsvp'][0], compat_str)))
2041 if manifest_url:
2042 formats = []
2043 m3u8_formats = self._extract_m3u8_formats(
2044 manifest_url, video_id, 'mp4', fatal=False)
2045 for a_format in m3u8_formats:
2046 itag = self._search_regex(
2047 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2048 if itag:
2049 a_format['format_id'] = itag
2050 if itag in self._formats:
2051 dct = self._formats[itag].copy()
2052 dct.update(a_format)
2053 a_format = dct
2054 a_format['player_url'] = player_url
2055 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2056 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2057 formats.append(a_format)
2058 else:
2059 error_message = clean_html(video_info.get('reason', [None])[0])
2060 if not error_message:
2061 error_message = extract_unavailable_message()
2062 if error_message:
2063 raise ExtractorError(error_message, expected=True)
2064 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2065
2066 # uploader
2067 video_uploader = try_get(
2068 video_info, lambda x: x['author'][0],
2069 compat_str) or str_or_none(video_details.get('author'))
2070 if video_uploader:
2071 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2072 else:
2073 self._downloader.report_warning('unable to extract uploader name')
2074
2075 # uploader_id
2076 video_uploader_id = None
2077 video_uploader_url = None
2078 mobj = re.search(
2079 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2080 video_webpage)
2081 if mobj is not None:
2082 video_uploader_id = mobj.group('uploader_id')
2083 video_uploader_url = mobj.group('uploader_url')
2084 else:
2085 self._downloader.report_warning('unable to extract uploader nickname')
2086
2087 channel_id = self._html_search_meta(
2088 'channelId', video_webpage, 'channel id')
2089 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2090
2091 # thumbnail image
2092 # We try first to get a high quality image:
2093 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2094 video_webpage, re.DOTALL)
2095 if m_thumb is not None:
2096 video_thumbnail = m_thumb.group(1)
2097 elif 'thumbnail_url' not in video_info:
2098 self._downloader.report_warning('unable to extract video thumbnail')
2099 video_thumbnail = None
2100 else: # don't panic if we can't find it
2101 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
2102
2103 # upload date
2104 upload_date = self._html_search_meta(
2105 'datePublished', video_webpage, 'upload date', default=None)
2106 if not upload_date:
2107 upload_date = self._search_regex(
2108 [r'(?s)id="eow-date.*?>(.*?)</span>',
2109 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2110 video_webpage, 'upload date', default=None)
2111 upload_date = unified_strdate(upload_date)
2112
2113 video_license = self._html_search_regex(
2114 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2115 video_webpage, 'license', default=None)
2116
2117 m_music = re.search(
2118 r'''(?x)
2119 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2120 <ul[^>]*>\s*
2121 <li>(?P<title>.+?)
2122 by (?P<creator>.+?)
2123 (?:
2124 \(.+?\)|
2125 <a[^>]*
2126 (?:
2127 \bhref=["\']/red[^>]*>| # drop possible
2128 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2129 )
2130 .*?
2131 )?</li
2132 ''',
2133 video_webpage)
2134 if m_music:
2135 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2136 video_creator = clean_html(m_music.group('creator'))
2137 else:
2138 video_alt_title = video_creator = None
2139
2140 def extract_meta(field):
2141 return self._html_search_regex(
2142 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2143 video_webpage, field, default=None)
2144
2145 track = extract_meta('Song')
2146 artist = extract_meta('Artist')
2147 album = extract_meta('Album')
2148
2149 # Youtube Music Auto-generated description
2150 release_date = release_year = None
2151 if video_description:
2152 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2153 if mobj:
2154 if not track:
2155 track = mobj.group('track').strip()
2156 if not artist:
2157 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2158 if not album:
2159 album = mobj.group('album'.strip())
2160 release_year = mobj.group('release_year')
2161 release_date = mobj.group('release_date')
2162 if release_date:
2163 release_date = release_date.replace('-', '')
2164 if not release_year:
2165 release_year = int(release_date[:4])
2166 if release_year:
2167 release_year = int(release_year)
2168
2169 m_episode = re.search(
2170 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2171 video_webpage)
2172 if m_episode:
2173 series = unescapeHTML(m_episode.group('series'))
2174 season_number = int(m_episode.group('season'))
2175 episode_number = int(m_episode.group('episode'))
2176 else:
2177 series = season_number = episode_number = None
2178
2179 m_cat_container = self._search_regex(
2180 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2181 video_webpage, 'categories', default=None)
2182 if m_cat_container:
2183 category = self._html_search_regex(
2184 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2185 default=None)
2186 video_categories = None if category is None else [category]
2187 else:
2188 video_categories = None
2189
2190 video_tags = [
2191 unescapeHTML(m.group('content'))
2192 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2193
2194 def _extract_count(count_name):
2195 return str_to_int(self._search_regex(
2196 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2197 % re.escape(count_name),
2198 video_webpage, count_name, default=None))
2199
2200 like_count = _extract_count('like')
2201 dislike_count = _extract_count('dislike')
2202
2203 if view_count is None:
2204 view_count = str_to_int(self._search_regex(
2205 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2206 'view count', default=None))
2207
2208 # subtitles
2209 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2210 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2211
2212 video_duration = try_get(
2213 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2214 if not video_duration:
2215 video_duration = int_or_none(video_details.get('lengthSeconds'))
2216 if not video_duration:
2217 video_duration = parse_duration(self._html_search_meta(
2218 'duration', video_webpage, 'video duration'))
2219
2220 # annotations
2221 video_annotations = None
2222 if self._downloader.params.get('writeannotations', False):
2223 video_annotations = self._extract_annotations(video_id)
2224
2225 chapters = self._extract_chapters(description_original, video_duration)
2226
2227 # Look for the DASH manifest
2228 if self._downloader.params.get('youtube_include_dash_manifest', True):
2229 dash_mpd_fatal = True
2230 for mpd_url in dash_mpds:
2231 dash_formats = {}
2232 try:
                    def decrypt_sig(mobj):
                        # re.sub callback: replace an encrypted '/s/<sig>' path
                        # component of the MPD URL with its decrypted
                        # '/signature/<sig>' form.
                        s = mobj.group(1)
                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                        return '/signature/%s' % dec_s
2237
2238 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2239
2240 for df in self._extract_mpd_formats(
2241 mpd_url, video_id, fatal=dash_mpd_fatal,
2242 formats_dict=self._formats):
2243 if not df.get('filesize'):
2244 df['filesize'] = _extract_filesize(df['url'])
2245 # Do not overwrite DASH format found in some previous DASH manifest
2246 if df['format_id'] not in dash_formats:
2247 dash_formats[df['format_id']] = df
2248 # Additional DASH manifests may end up in HTTP Error 403 therefore
2249 # allow them to fail without bug report message if we already have
2250 # some DASH manifest succeeded. This is temporary workaround to reduce
2251 # burst of bug reports until we figure out the reason and whether it
2252 # can be fixed at all.
2253 dash_mpd_fatal = False
2254 except (ExtractorError, KeyError) as e:
2255 self.report_warning(
2256 'Skipping DASH manifest: %r' % e, video_id)
2257 if dash_formats:
2258 # Remove the formats we found through non-DASH, they
2259 # contain less info and it can be wrong, because we use
2260 # fixed values (for example the resolution). See
2261 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2262 # example.
2263 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2264 formats.extend(dash_formats.values())
2265
2266 # Check for malformed aspect ratio
2267 stretched_m = re.search(
2268 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2269 video_webpage)
2270 if stretched_m:
2271 w = float(stretched_m.group('w'))
2272 h = float(stretched_m.group('h'))
2273 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2274 # We will only process correct ratios.
2275 if w > 0 and h > 0:
2276 ratio = w / h
2277 for f in formats:
2278 if f.get('vcodec') != 'none':
2279 f['stretched_ratio'] = ratio
2280
2281 if not formats:
2282 token = extract_token(video_info)
2283 if not token:
2284 if 'reason' in video_info:
2285 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2286 regions_allowed = self._html_search_meta(
2287 'regionsAllowed', video_webpage, default=None)
2288 countries = regions_allowed.split(',') if regions_allowed else None
2289 self.raise_geo_restricted(
2290 msg=video_info['reason'][0], countries=countries)
2291 reason = video_info['reason'][0]
2292 if 'Invalid parameters' in reason:
2293 unavailable_message = extract_unavailable_message()
2294 if unavailable_message:
2295 reason = unavailable_message
2296 raise ExtractorError(
2297 'YouTube said: %s' % reason,
2298 expected=True, video_id=video_id)
2299 else:
2300 raise ExtractorError(
2301 '"token" parameter not in video info for unknown reason',
2302 video_id=video_id)
2303
2304 self._sort_formats(formats)
2305
2306 self.mark_watched(video_id, video_info, player_response)
2307
2308 return {
2309 'id': video_id,
2310 'uploader': video_uploader,
2311 'uploader_id': video_uploader_id,
2312 'uploader_url': video_uploader_url,
2313 'channel_id': channel_id,
2314 'channel_url': channel_url,
2315 'upload_date': upload_date,
2316 'license': video_license,
2317 'creator': video_creator or artist,
2318 'title': video_title,
2319 'alt_title': video_alt_title or track,
2320 'thumbnail': video_thumbnail,
2321 'description': video_description,
2322 'categories': video_categories,
2323 'tags': video_tags,
2324 'subtitles': video_subtitles,
2325 'automatic_captions': automatic_captions,
2326 'duration': video_duration,
2327 'age_limit': 18 if age_gate else 0,
2328 'annotations': video_annotations,
2329 'chapters': chapters,
2330 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2331 'view_count': view_count,
2332 'like_count': like_count,
2333 'dislike_count': dislike_count,
2334 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
2335 'formats': formats,
2336 'is_live': is_live,
2337 'start_time': start_time,
2338 'end_time': end_time,
2339 'series': series,
2340 'season_number': season_number,
2341 'episode_number': episode_number,
2342 'track': track,
2343 'artist': artist,
2344 'album': album,
2345 'release_date': release_date,
2346 'release_year': release_year,
2347 }
2348
2349
2350 class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
2351 IE_DESC = 'YouTube.com playlists'
2352 _VALID_URL = r"""(?x)(?:
2353 (?:https?://)?
2354 (?:\w+\.)?
2355 (?:
2356 (?:
2357 youtube\.com|
2358 invidio\.us
2359 )
2360 /
2361 (?:
2362 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
2363 \? (?:.*?[&;])*? (?:p|a|list)=
2364 | p/
2365 )|
2366 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
2367 )
2368 (
2369 (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
2370 # Top tracks, they can also include dots
2371 |(?:MC)[\w\.]*
2372 )
2373 .*
2374 |
2375 (%(playlist_id)s)
2376 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
2377 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
2378 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
2379 IE_NAME = 'youtube:playlist'
2380 _TESTS = [{
2381 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2382 'info_dict': {
2383 'title': 'ytdl test PL',
2384 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
2385 },
2386 'playlist_count': 3,
2387 }, {
2388 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2389 'info_dict': {
2390 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
2391 'title': 'YDL_Empty_List',
2392 },
2393 'playlist_count': 0,
2394 'skip': 'This playlist is private',
2395 }, {
2396 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2397 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2398 'info_dict': {
2399 'title': '29C3: Not my department',
2400 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2401 },
2402 'playlist_count': 95,
2403 }, {
2404 'note': 'issue #673',
2405 'url': 'PLBB231211A4F62143',
2406 'info_dict': {
2407 'title': '[OLD]Team Fortress 2 (Class-based LP)',
2408 'id': 'PLBB231211A4F62143',
2409 },
2410 'playlist_mincount': 26,
2411 }, {
2412 'note': 'Large playlist',
2413 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2414 'info_dict': {
2415 'title': 'Uploads from Cauchemar',
2416 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2417 },
2418 'playlist_mincount': 799,
2419 }, {
2420 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2421 'info_dict': {
2422 'title': 'YDL_safe_search',
2423 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2424 },
2425 'playlist_count': 2,
2426 'skip': 'This playlist is private',
2427 }, {
2428 'note': 'embedded',
2429 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
2430 'playlist_count': 4,
2431 'info_dict': {
2432 'title': 'JODA15',
2433 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
2434 }
2435 }, {
2436 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2437 'playlist_mincount': 485,
2438 'info_dict': {
2439 'title': '2017 華語最新單曲 (2/24更新)',
2440 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2441 }
2442 }, {
2443 'note': 'Embedded SWF player',
2444 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
2445 'playlist_count': 4,
2446 'info_dict': {
2447 'title': 'JODA7',
2448 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
2449 }
2450 }, {
2451 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2452 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2453 'info_dict': {
2454 'title': 'Uploads from Interstellar Movie',
2455 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2456 },
2457 'playlist_mincount': 21,
2458 }, {
2459 # Playlist URL that does not actually serve a playlist
2460 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2461 'info_dict': {
2462 'id': 'FqZTN594JQw',
2463 'ext': 'webm',
2464 'title': "Smiley's People 01 detective, Adventure Series, Action",
2465 'uploader': 'STREEM',
2466 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2467 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2468 'upload_date': '20150526',
2469 'license': 'Standard YouTube License',
2470 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2471 'categories': ['People & Blogs'],
2472 'tags': list,
2473 'view_count': int,
2474 'like_count': int,
2475 'dislike_count': int,
2476 },
2477 'params': {
2478 'skip_download': True,
2479 },
2480 'add_ie': [YoutubeIE.ie_key()],
2481 }, {
2482 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2483 'info_dict': {
2484 'id': 'yeWKywCrFtk',
2485 'ext': 'mp4',
2486 'title': 'Small Scale Baler and Braiding Rugs',
2487 'uploader': 'Backus-Page House Museum',
2488 'uploader_id': 'backuspagemuseum',
2489 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
2490 'upload_date': '20161008',
2491 'license': 'Standard YouTube License',
2492 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2493 'categories': ['Nonprofits & Activism'],
2494 'tags': list,
2495 'like_count': int,
2496 'dislike_count': int,
2497 },
2498 'params': {
2499 'noplaylist': True,
2500 'skip_download': True,
2501 },
2502 }, {
2503 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2504 'only_matching': True,
2505 }, {
2506 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2507 'only_matching': True,
2508 }, {
2509 # music album playlist
2510 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2511 'only_matching': True,
2512 }, {
2513 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2514 'only_matching': True,
2515 }]
2516
    def _real_initialize(self):
        # Authenticate up front (when credentials were supplied) so that
        # private playlists are accessible during extraction.
        self._login()
2519
2520 def _extract_mix(self, playlist_id):
2521 # The mixes are generated from a single video
2522 # the id of the playlist is just 'RD' + video_id
2523 ids = []
2524 last_id = playlist_id[-11:]
2525 for n in itertools.count(1):
2526 url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
2527 webpage = self._download_webpage(
2528 url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
2529 new_ids = orderedSet(re.findall(
2530 r'''(?xs)data-video-username=".*?".*?
2531 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
2532 webpage))
2533 # Fetch new pages until all the videos are repeated, it seems that
2534 # there are always 51 unique videos.
2535 new_ids = [_id for _id in new_ids if _id not in ids]
2536 if not new_ids:
2537 break
2538 ids.extend(new_ids)
2539 last_id = ids[-1]
2540
2541 url_results = self._ids_to_results(ids)
2542
2543 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
2544 title_span = (
2545 search_title('playlist-title') or
2546 search_title('title long-title') or
2547 search_title('title'))
2548 title = clean_html(title_span)
2549
2550 return self.playlist_result(url_results, playlist_id, title)
2551
    def _extract_playlist(self, playlist_id):
        """Download and parse a regular playlist page.

        Returns a (has_videos, playlist) tuple; has_videos is False when
        no playlist title was found and the page yields no entries, which
        lets the caller fall back to single-video extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                # Language chooser banner is harmless; ignore it.
                continue
            else:
                # Unknown alert: surface it but keep extracting.
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        # Uploader name and channel/user path share a common markup prefix.
        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist
2613
2614 def _check_download_just_video(self, url, playlist_id):
2615 # Check if it's a video-specific URL
2616 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
2617 video_id = query_dict.get('v', [None])[0] or self._search_regex(
2618 r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
2619 'video id', default=None)
2620 if video_id:
2621 if self._downloader.params.get('noplaylist'):
2622 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2623 return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
2624 else:
2625 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
2626 return video_id, None
2627 return None, None
2628
2629 def _real_extract(self, url):
2630 # Extract playlist id
2631 mobj = re.match(self._VALID_URL, url)
2632 if mobj is None:
2633 raise ExtractorError('Invalid URL: %s' % url)
2634 playlist_id = mobj.group(1) or mobj.group(2)
2635
2636 video_id, video = self._check_download_just_video(url, playlist_id)
2637 if video:
2638 return video
2639
2640 if playlist_id.startswith(('RD', 'UL', 'PU')):
2641 # Mixes require a custom extraction process
2642 return self._extract_mix(playlist_id)
2643
2644 has_videos, playlist = self._extract_playlist(playlist_id)
2645 if has_videos or not video_id:
2646 return playlist
2647
2648 # Some playlist URLs don't actually serve a playlist (see
2649 # https://github.com/ytdl-org/youtube-dl/issues/10537).
2650 # Fallback to plain video extraction if there is a video id
2651 # along with playlist id.
2652 return self.url_result(video_id, 'Youtube', video_id=video_id)
2653
2654
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when either
        # of them can handle this URL.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Overridden by YoutubeUserIE, which also needs the path component
        # from the original URL.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            # First try the channelId meta tag, then fall back to the app
            # deep-link URLs that also embed the channel id.
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        # A channel id 'UCxxx' maps to its uploads playlist 'UUxxx'.
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        # Probe the first entry: an empty channel page may actually carry an
        # alert (e.g. geo restriction) worth surfacing as an error.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2747
2748
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL is very permissive, so step aside whenever any other
        # Youtube extractor can handle the URL instead.
        if any(
                klass.suitable(url)
                for name, klass in globals().items()
                if name.startswith('Youtube') and name.endswith('IE') and klass is not cls):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Preserve the 'user' vs 'c' path component of the original URL
        # (defaulting to 'user' for ytuser: style inputs).
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
2799
2800
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to its current live video when one can be
        identified, otherwise hand the base channel/user URL back."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the metadata when it points at a well-formed video id.
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
2851
2852
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Lists all playlists of a user or channel; the actual page walking is
    # implemented in the base class.
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]
2881
2882
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches a watch link (and, when present, its title attribute) in a
    # search results page; consumed by the base class page parser.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
2885
2886
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML search results until n videos have been
        collected or no further page is available.
        """

        videos = []
        limit = n

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as the limit is reached (>=, not >) so that no
            # extra result page is downloaded once enough videos are known;
            # the final slice below still caps the result at exactly n.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
2935
2936
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same as YoutubeSearchIE but asks the site to sort by upload date.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2942
2943
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The (decoded) query string doubles as both the display id for the
        # page download and the resulting playlist title.
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
2964
2965
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just the collection of its season playlists, so rewrite
        # the URL and let the playlists base extractor do the work.
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % self._match_id(url))
2983
2984
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
            if not new_ids:
                break

            ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            # Follow the AJAX 'load more' widget until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        # The URL itself carries no information; the feed name decides what
        # gets downloaded.
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3036
3037
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The watch-later list always has the fixed playlist id 'WL';
        # --no-playlist handling still applies for watch?v=...&list=WL URLs.
        video_id, video = self._check_download_just_video(url, 'WL')
        if video is not None:
            return video
        has_videos, playlist = self._extract_playlist('WL')
        return playlist
3057
3058
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # Scrape the underlying playlist id from the favourites page and
        # delegate to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
3069
3070
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Concrete feed: /feed/recommended (extraction logic lives in the base).
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3076
3077
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Concrete feed: /feed/subscriptions (extraction logic lives in the base).
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3083
3084
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Concrete feed: /feed/history (extraction logic lives in the base).
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3090
3091
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch URLs whose v= parameter is missing entirely — typically
    # the result of an unquoted '&' eating the rest of the URL in a shell —
    # and turns them into a helpful error message.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always raise: there is no video id to extract from such a URL.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3139
3140
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Matched ids are 1-10 characters — shorter than a real 11-character
        # video id — so this always raises a descriptive error.
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (
                self._match_id(url), url),
            expected=True)