# Source: jfr.im git mirror (yt-dlp.git) - youtube_dl/extractor/youtube.py
# Commit: [youtube:playlist] Fix test title
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import traceback
11
12 from .common import InfoExtractor, SearchInfoExtractor
13 from .subtitles import SubtitlesInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..utils import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_request,
21 compat_urlparse,
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 OnDemandPagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 uppercase_escape,
34 )
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube frontend to English; return True if the request succeeded."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # BUGFIX: previously this fell through and crashed with
                # AttributeError on match.group(1); treat it as a login failure.
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # BUGFIX: same fall-through crash fixed here as for secTok above.
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form (best effort, non-fatal)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set language, log in (when credentials are given) and confirm age."""
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
200
201
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Single-video URL matcher.  Group 1 is the (optional) scheme/host/path
    # prefix; group 2 is the 11-character video id, which is what
    # extract_id() returns.  The whole prefix is optional so a naked id is
    # accepted as well.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the original target URL from age-verification style redirects
    # (?next_url=... / &next_url=...); used by _real_extract.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static metadata for known YouTube itags, keyed by itag (as a string).
    # 'preference' ranks whole groups: plain muxed formats (no preference)
    # beat HLS (-10), which beat 3D (-20), DASH video (-40) and DASH audio
    # (-50).  DASH entries carry 'acodec'/'vcodec' == 'none' to mark
    # video-only / audio-only streams.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
308
    IE_NAME = 'youtube'
    # Self-test fixtures consumed by the test runner: each entry pairs an
    # input URL with the fields the extractor is expected to produce
    # ('md5:...' values are checksums of the full text).
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
    ]
400
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and its per-instance signature-function cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> deciphering callable;
        # filled lazily by _decrypt_signature.
        self._player_cache = {}
404
405 def report_video_info_webpage_download(self, video_id):
406 """Report attempt to download video info webpage."""
407 self.to_screen('%s: Downloading video info webpage' % video_id)
408
409 def report_information_extraction(self, video_id):
410 """Report attempt to extract video information."""
411 self.to_screen('%s: Extracting video information' % video_id)
412
413 def report_unavailable_format(self, video_id, format):
414 """Report extracted video URL."""
415 self.to_screen('%s: Format %s not available' % (video_id, format))
416
417 def report_rtmp_download(self):
418 """Indicate the download will use the RTMP protocol."""
419 self.to_screen('RTMP download detected')
420
421 def _signature_cache_id(self, example_sig):
422 """ Return a string representation of a signature """
423 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
424
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from disk cache) the signature-deciphering function.

        Returns a callable mapping an encrypted signature string to the
        deciphered one.  The derived character permutation is cached on disk
        keyed by player type, player id and the signature's cache id.
        """
        # The player URL encodes both an id and the player type (js vs swf).
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a file name; guard against path separators in it.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a permutation: output char i is input position
            # cache_spec[i].
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # cache_spec is necessarily None here (the cached case returned
        # above); derive the permutation by running res on a probe string of
        # distinct characters.
        if cache_spec is None:
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
466
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*.

        Used by the youtube_print_sig_code option: the permutation is probed
        with a string of distinct characters and compressed into s[...] slice
        expressions for easy pasting into code.
        """
        def gen_sig_code(idxs):
            # Yields 's[i]' terms and 's[a:b:c]' slices; consecutive indices
            # with a constant step of +1 or -1 are folded into one slice.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # end+step is the exclusive slice bound; a negative bound
                # would wrap around, so fall back to an open-ended ':'.
                ends = (':%d' % (end+step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk consecutive (current, previous) index pairs.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or close the slice and reset.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Adjacent indices open a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush whatever the loop left open: a lone index or an open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
505
506 def _parse_sig_js(self, jscode):
507 funcname = self._search_regex(
508 r'signature=([$a-zA-Z]+)', jscode,
509 'Initial JS player signature function name')
510
511 jsi = JSInterpreter(jscode)
512 initial_function = jsi.extract_function(funcname)
513 return lambda s: initial_function([s])
514
515 def _parse_sig_swf(self, file_contents):
516 swfi = SWFInterpreter(file_contents)
517 TARGET_CLASSNAME = 'SignatureDecipher'
518 searched_class = swfi.extract_class(TARGET_CLASSNAME)
519 initial_function = swfi.extract_function(searched_class, 'decipher')
520 return lambda s: initial_function([s])
521
522 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
523 """Turn the encrypted s field into a working signature"""
524
525 if player_url is None:
526 raise ExtractorError('Cannot decrypt signature without player_url')
527
528 if player_url.startswith('//'):
529 player_url = 'https:' + player_url
530 try:
531 player_id = (player_url, self._signature_cache_id(s))
532 if player_id not in self._player_cache:
533 func = self._extract_signature_function(
534 video_id, player_url, s
535 )
536 self._player_cache[player_id] = func
537 func = self._player_cache[player_id]
538 if self._downloader.params.get('youtube_print_sig_code'):
539 self._print_sig_code(func, s)
540 return func(s)
541 except Exception as e:
542 tb = traceback.format_exc()
543 raise ExtractorError(
544 'Signature extraction failed: ' + tb, cause=e)
545
546 def _get_available_subtitles(self, video_id, webpage):
547 try:
548 sub_list = self._download_webpage(
549 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
550 video_id, note=False)
551 except ExtractorError as err:
552 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
553 return {}
554 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
555
556 sub_lang_list = {}
557 for l in lang_list:
558 lang = l[1]
559 if lang in sub_lang_list:
560 continue
561 params = compat_urllib_parse.urlencode({
562 'lang': lang,
563 'v': video_id,
564 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
565 'name': unescapeHTML(l[0]).encode('utf-8'),
566 })
567 url = 'https://www.youtube.com/api/timedtext?' + params
568 sub_lang_list[lang] = url
569 if not sub_lang_list:
570 self._downloader.report_warning('video doesn\'t have subtitles')
571 return {}
572 return sub_lang_list
573
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # The caption endpoint and timestamp live in the embedded player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only an ASR (auto speech recognition) first track means the
            # video actually has automatic captions.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
620
621 @classmethod
622 def extract_id(cls, url):
623 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
624 if mobj is None:
625 raise ExtractorError('Invalid URL: %s' % url)
626 video_id = mobj.group(2)
627 return video_id
628
629 def _extract_from_m3u8(self, manifest_url, video_id):
630 url_map = {}
631 def _get_urls(_manifest):
632 lines = _manifest.split('\n')
633 urls = filter(lambda l: l and not l.startswith('#'),
634 lines)
635 return urls
636 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
637 formats_urls = _get_urls(manifest)
638 for format_url in formats_urls:
639 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
640 url_map[itag] = format_url
641 return url_map
642
643 def _extract_annotations(self, video_id):
644 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
645 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
646
647 def _real_extract(self, url):
648 proto = (
649 'http' if self._downloader.params.get('prefer_insecure', False)
650 else 'https')
651
652 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
653 mobj = re.search(self._NEXT_URL_RE, url)
654 if mobj:
655 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
656 video_id = self.extract_id(url)
657
658 # Get video webpage
659 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
660 pref_cookies = [
661 c for c in self._downloader.cookiejar
662 if c.domain == '.youtube.com' and c.name == 'PREF']
663 for pc in pref_cookies:
664 if 'hl=' in pc.value:
665 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
666 else:
667 if pc.value:
668 pc.value += '&'
669 pc.value += 'hl=en'
670 video_webpage = self._download_webpage(url, video_id)
671
672 # Attempt to extract SWF player URL
673 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
674 if mobj is not None:
675 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
676 else:
677 player_url = None
678
679 # Get video info
680 self.report_video_info_webpage_download(video_id)
681 if re.search(r'player-age-gate-content">', video_webpage) is not None:
682 self.report_age_confirmation()
683 age_gate = True
684 # We simulate the access to the video from www.youtube.com/v/{video_id}
685 # this can be viewed without login into Youtube
686 data = compat_urllib_parse.urlencode({
687 'video_id': video_id,
688 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
689 'sts': self._search_regex(
690 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
691 })
692 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
693 video_info_webpage = self._download_webpage(video_info_url, video_id,
694 note=False,
695 errnote='unable to download video info webpage')
696 video_info = compat_parse_qs(video_info_webpage)
697 else:
698 age_gate = False
699 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
700 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
701 % (video_id, el_type))
702 video_info_webpage = self._download_webpage(video_info_url, video_id,
703 note=False,
704 errnote='unable to download video info webpage')
705 video_info = compat_parse_qs(video_info_webpage)
706 if 'token' in video_info:
707 break
708 if 'token' not in video_info:
709 if 'reason' in video_info:
710 raise ExtractorError(
711 'YouTube said: %s' % video_info['reason'][0],
712 expected=True, video_id=video_id)
713 else:
714 raise ExtractorError(
715 '"token" parameter not in video info for unknown reason',
716 video_id=video_id)
717
718 if 'view_count' in video_info:
719 view_count = int(video_info['view_count'][0])
720 else:
721 view_count = None
722
723 # Check for "rental" videos
724 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
725 raise ExtractorError('"rental" videos not supported')
726
727 # Start extracting information
728 self.report_information_extraction(video_id)
729
730 # uploader
731 if 'author' not in video_info:
732 raise ExtractorError('Unable to extract uploader name')
733 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
734
735 # uploader_id
736 video_uploader_id = None
737 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
738 if mobj is not None:
739 video_uploader_id = mobj.group(1)
740 else:
741 self._downloader.report_warning('unable to extract uploader nickname')
742
743 # title
744 if 'title' in video_info:
745 video_title = video_info['title'][0]
746 else:
747 self._downloader.report_warning('Unable to extract video title')
748 video_title = '_'
749
750 # thumbnail image
751 # We try first to get a high quality image:
752 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
753 video_webpage, re.DOTALL)
754 if m_thumb is not None:
755 video_thumbnail = m_thumb.group(1)
756 elif 'thumbnail_url' not in video_info:
757 self._downloader.report_warning('unable to extract video thumbnail')
758 video_thumbnail = None
759 else: # don't panic if we can't find it
760 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
761
762 # upload date
763 upload_date = None
764 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
765 if mobj is None:
766 mobj = re.search(
767 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
768 video_webpage)
769 if mobj is not None:
770 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
771 upload_date = unified_strdate(upload_date)
772
773 m_cat_container = self._search_regex(
774 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
775 video_webpage, 'categories', fatal=False)
776 if m_cat_container:
777 category = self._html_search_regex(
778 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
779 default=None)
780 video_categories = None if category is None else [category]
781 else:
782 video_categories = None
783
784 # description
785 video_description = get_element_by_id("eow-description", video_webpage)
786 if video_description:
787 video_description = re.sub(r'''(?x)
788 <a\s+
789 (?:[a-zA-Z-]+="[^"]+"\s+)*?
790 title="([^"]+)"\s+
791 (?:[a-zA-Z-]+="[^"]+"\s+)*?
792 class="yt-uix-redirect-link"\s*>
793 [^<]+
794 </a>
795 ''', r'\1', video_description)
796 video_description = clean_html(video_description)
797 else:
798 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
799 if fd_mobj:
800 video_description = unescapeHTML(fd_mobj.group(1))
801 else:
802 video_description = ''
803
804 def _extract_count(count_name):
805 count = self._search_regex(
806 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
807 video_webpage, count_name, default=None)
808 if count is not None:
809 return int(count.replace(',', ''))
810 return None
811 like_count = _extract_count('like')
812 dislike_count = _extract_count('dislike')
813
814 # subtitles
815 video_subtitles = self.extract_subtitles(video_id, video_webpage)
816
817 if self._downloader.params.get('listsubtitles', False):
818 self._list_available_subtitles(video_id, video_webpage)
819 return
820
821 if 'length_seconds' not in video_info:
822 self._downloader.report_warning('unable to extract video duration')
823 video_duration = None
824 else:
825 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
826
827 # annotations
828 video_annotations = None
829 if self._downloader.params.get('writeannotations', False):
830 video_annotations = self._extract_annotations(video_id)
831
832 # Decide which formats to download
833 try:
834 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
835 if not mobj:
836 raise ValueError('Could not find vevo ID')
837 json_code = uppercase_escape(mobj.group(1))
838 ytplayer_config = json.loads(json_code)
839 args = ytplayer_config['args']
840 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
841 # this signatures are encrypted
842 if 'url_encoded_fmt_stream_map' not in args:
843 raise ValueError('No stream_map present') # caught below
844 re_signature = re.compile(r'[&,]s=')
845 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
846 if m_s is not None:
847 self.to_screen('%s: Encrypted signatures detected.' % video_id)
848 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
849 m_s = re_signature.search(args.get('adaptive_fmts', ''))
850 if m_s is not None:
851 if 'adaptive_fmts' in video_info:
852 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
853 else:
854 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
855 except ValueError:
856 pass
857
858 def _map_to_format_list(urlmap):
859 formats = []
860 for itag, video_real_url in urlmap.items():
861 dct = {
862 'format_id': itag,
863 'url': video_real_url,
864 'player_url': player_url,
865 }
866 if itag in self._formats:
867 dct.update(self._formats[itag])
868 formats.append(dct)
869 return formats
870
871 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
872 self.report_rtmp_download()
873 formats = [{
874 'format_id': '_rtmp',
875 'protocol': 'rtmp',
876 'url': video_info['conn'][0],
877 'player_url': player_url,
878 }]
879 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
880 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
881 if 'rtmpe%3Dyes' in encoded_url_map:
882 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
883 url_map = {}
884 for url_data_str in encoded_url_map.split(','):
885 url_data = compat_parse_qs(url_data_str)
886 if 'itag' not in url_data or 'url' not in url_data:
887 continue
888 format_id = url_data['itag'][0]
889 url = url_data['url'][0]
890
891 if 'sig' in url_data:
892 url += '&signature=' + url_data['sig'][0]
893 elif 's' in url_data:
894 encrypted_sig = url_data['s'][0]
895
896 if not age_gate:
897 jsplayer_url_json = self._search_regex(
898 r'"assets":.+?"js":\s*("[^"]+")',
899 video_webpage, 'JS player URL')
900 player_url = json.loads(jsplayer_url_json)
901 if player_url is None:
902 player_url_json = self._search_regex(
903 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
904 video_webpage, 'age gate player URL')
905 player_url = json.loads(player_url_json)
906
907 if self._downloader.params.get('verbose'):
908 if player_url is None:
909 player_version = 'unknown'
910 player_desc = 'unknown'
911 else:
912 if player_url.endswith('swf'):
913 player_version = self._search_regex(
914 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
915 'flash player', fatal=False)
916 player_desc = 'flash player %s' % player_version
917 else:
918 player_version = self._search_regex(
919 r'html5player-([^/]+?)(?:/html5player)?\.js',
920 player_url,
921 'html5 player', fatal=False)
922 player_desc = 'html5 player %s' % player_version
923
924 parts_sizes = self._signature_cache_id(encrypted_sig)
925 self.to_screen('{%s} signature length %s, %s' %
926 (format_id, parts_sizes, player_desc))
927
928 signature = self._decrypt_signature(
929 encrypted_sig, video_id, player_url, age_gate)
930 url += '&signature=' + signature
931 if 'ratebypass' not in url:
932 url += '&ratebypass=yes'
933 url_map[format_id] = url
934 formats = _map_to_format_list(url_map)
935 elif video_info.get('hlsvp'):
936 manifest_url = video_info['hlsvp'][0]
937 url_map = self._extract_from_m3u8(manifest_url, video_id)
938 formats = _map_to_format_list(url_map)
939 else:
940 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
941
942 # Look for the DASH manifest
943 if self._downloader.params.get('youtube_include_dash_manifest', True):
944 try:
945 # The DASH manifest used needs to be the one from the original video_webpage.
946 # The one found in get_video_info seems to be using different signatures.
947 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
948 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
949 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
950 if age_gate:
951 dash_manifest_url = video_info.get('dashmpd')[0]
952 else:
953 dash_manifest_url = ytplayer_config['args']['dashmpd']
954 def decrypt_sig(mobj):
955 s = mobj.group(1)
956 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
957 return '/signature/%s' % dec_s
958 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
959 dash_doc = self._download_xml(
960 dash_manifest_url, video_id,
961 note='Downloading DASH manifest',
962 errnote='Could not download DASH manifest')
963 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
964 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
965 if url_el is None:
966 continue
967 format_id = r.attrib['id']
968 video_url = url_el.text
969 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
970 f = {
971 'format_id': format_id,
972 'url': video_url,
973 'width': int_or_none(r.attrib.get('width')),
974 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
975 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
976 'filesize': filesize,
977 }
978 try:
979 existing_format = next(
980 fo for fo in formats
981 if fo['format_id'] == format_id)
982 except StopIteration:
983 f.update(self._formats.get(format_id, {}))
984 formats.append(f)
985 else:
986 existing_format.update(f)
987
988 except (ExtractorError, KeyError) as e:
989 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
990
991 self._sort_formats(formats)
992
993 return {
994 'id': video_id,
995 'uploader': video_uploader,
996 'uploader_id': video_uploader_id,
997 'upload_date': upload_date,
998 'title': video_title,
999 'thumbnail': video_thumbnail,
1000 'description': video_description,
1001 'categories': video_categories,
1002 'subtitles': video_subtitles,
1003 'duration': video_duration,
1004 'age_limit': 18 if age_gate else 0,
1005 'annotations': video_annotations,
1006 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1007 'view_count': view_count,
1008 'like_count': like_count,
1009 'dislike_count': dislike_count,
1010 'formats': formats,
1011 }
1012
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist.

    Handles regular playlists (PL/LL/EC/UU/FL prefixed ids), "top tracks"
    lists (MC ids, which may contain dots) and auto-generated mixes
    (RD ids), the latter via a dedicated extraction path (_extract_mix).
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  p/
                     )
                     (
                         (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                  |
                     ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Marker found in the "load more" widget HTML while further pages exist.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Captures the video id and its playlist index from each entry link.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in (if credentials were given) before any extraction."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result pointing at the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix (RD-prefixed playlist id).

        Mixes are generated from a single video; the id of the playlist is
        just 'RD' + video_id, so the mix page is fetched as a watch URL.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the markup variants for the mix title, most specific first.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        # Only entries carrying data-video-username belong to the mix itself.
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        # Group 1 matches full URLs, group 2 matches bare playlist ids.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget points at the ajax URL of the next page.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1182
1183
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract a YouTube "top list" addressed as yttoplist:{channel}:{title}."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the list link on the channel page by its title, then scrape
        the video ids from the resolved playlist page (with retries)."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The matching link on the channel page carries the urlencoded title
        # inside its query string; build it so it can be searched literally.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1225
1226
class YoutubeChannelIE(InfoExtractor):
    """Extract all uploaded videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker present in the ajax "load more" widget while more pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order, deduplicated."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels are detected by their CSS class names.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1286
1287
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps results per request, so uploads are fetched page-wise.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match those too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding the url_result dicts of one GData page;
            # driven lazily by OnDemandPagedList below.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # No 'entry' key means the page is past the last upload.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1359
1360
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube via the GData API ("ytsearchN:query")."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)  # start-index is 1-based
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Don't request beyond what the API reports as available.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last (full) page.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1402
1403
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that returns the newest videos first.

    Identical to YoutubeSearchIE except that the API query orders results
    by publication date.
    """
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
1409
1410
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the videos listed on a YouTube search-results URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Fetch the results page for *url* and return its hits as a playlist."""
        match = re.match(self._VALID_URL, url)
        search_query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, search_query)
        # Restrict parsing to the result list itself.
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(snippet):
            # One snippet is the inner HTML of a result's title heading.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            }

        snippets = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)
        return {
            '_type': 'playlist',
            'entries': [build_entry(snippet) for snippet in snippets],
            'title': search_query,
        }
1452
1453
class YoutubeShowIE(InfoExtractor):
    """Extract a (multi-season) YouTube show as a playlist of season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        show_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, show_id, 'Downloading show webpage')
        # The show page links one playlist per season.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (show_id, len(season_matches)))
        season_entries = []
        for season in season_matches:
            season_entries.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        show_title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': show_title,
            'entries': season_entries,
        }
1488
1489
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed ajax URL template with a single %s placeholder for paging."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # _LOGIN_REQUIRED is True, so this raises if no credentials are given.
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            # Some feeds return 'feed_html', others 'content_html'.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # The widget carries the paging token of the next page, if any.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1535
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1541
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's "watch later" list."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True  # uses action_load_personal_feed
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1548
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's watch history."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fixed: this was a plain (non-raw) string, so '\.' was an invalid escape
    # sequence (DeprecationWarning on modern Python) and inconsistent with the
    # r'...' pattern strings used by every sibling extractor. The resulting
    # regex text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True  # uses action_load_personal_feed
    _PLAYLIST_TITLE = 'Youtube Watch History'
1555
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a ?list= parameter naming the user's
        # favourites playlist; hand extraction off to the playlist IE.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1566
1567
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the authenticated user's subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # No offline tests possible: the feed requires authentication.
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            # The "load more" widget links the ajax URL of the next page.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1604
1605
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose video id was lost (typically an unquoted '&'
    in the shell) and fail with a helpful message instead of extracting."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Extraction is never possible here by construction; always raise.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)