]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[youtube] Clarify output
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import traceback
11
12 from .common import InfoExtractor, SearchInfoExtractor
13 from .subtitles import SubtitlesInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..utils import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_request,
21 compat_urlparse,
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 OnDemandPagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 uppercase_escape,
34 )
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube UI to English; return True on success, False otherwise."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Download failed (fatal=False returns False): report login failure
            # explicitly instead of falling through with an implicit None.
            return False

        # GALX is a CSRF-style token that must be echoed back in the login form.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Bail out here: calling match.group(1) on None would crash
                # with AttributeError instead of reporting a login failure.
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Same as above: fail gracefully rather than crash on None.
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Still seeing the login form means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form (best effort, non-fatal)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
                                            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set language, log in (if credentials are given) and confirm age."""
        if self._downloader is None:
            return
        # Only force the UI language when we are about to log in.
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
200
201
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Matches full watch URLs, embed/short-link variants, several mirror
    # hosts, and bare 11-character video IDs; group 1 is the optional URL
    # prefix, group 2 is the video ID (see extract_id()).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the target URL of redirect pages (e.g. age verification).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static itag -> format-property table; 'preference' pushes DASH/HLS/3D
    # variants below the plain combined formats when sorting.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
    ]
405
406 def __init__(self, *args, **kwargs):
407 super(YoutubeIE, self).__init__(*args, **kwargs)
408 self._player_cache = {}
409
410 def report_video_info_webpage_download(self, video_id):
411 """Report attempt to download video info webpage."""
412 self.to_screen('%s: Downloading video info webpage' % video_id)
413
414 def report_information_extraction(self, video_id):
415 """Report attempt to extract video information."""
416 self.to_screen('%s: Extracting video information' % video_id)
417
418 def report_unavailable_format(self, video_id, format):
419 """Report extracted video URL."""
420 self.to_screen('%s: Format %s not available' % (video_id, format))
421
422 def report_rtmp_download(self):
423 """Indicate the download will use the RTMP protocol."""
424 self.to_screen('RTMP download detected')
425
426 def _signature_cache_id(self, example_sig):
427 """ Return a string representation of a signature """
428 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
429
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers encrypted signatures for the given player.

        The result is cached on disk as a list of character indices
        (a permutation spec), keyed by player type/id and the length
        pattern of example_sig.
        """
        # Derive player type ('js' or 'swf') and id from the player URL.
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id is used as a cache filename; it must not contain path parts.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: apply the permutation.
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            # SWF is binary; use _request_webpage to get raw bytes.
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # NOTE(review): cache_spec is always None at this point (the cached
        # case returned above), so this condition always holds.
        if cache_spec is None:
            # Run the function on a probe string of distinct characters to
            # recover the permutation it applies, then store that spec.
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
471
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function func.

        Runs func on a probe string, recovers the index permutation it
        applies, and compresses consecutive index runs into slice syntax
        for a human-readable reimplementation (used with
        --youtube-print-sig-code).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a Python slice expression, omitting redundant parts
                # (e.g. leading 0, step 1, or an end that reaches the bounds).
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end+step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set as soon as step is set
            start = '(Never used)'
            # Walk consecutive index pairs, coalescing runs with step +/-1
            # into slices and emitting isolated indices as s[i].
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    # Run ended one element ago: flush the slice.
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the trailing slice.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with a string of distinct characters so the output reveals
        # exactly which input position each output character came from.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
510
511 def _parse_sig_js(self, jscode):
512 funcname = self._search_regex(
513 r'signature=([$a-zA-Z]+)', jscode,
514 'Initial JS player signature function name')
515
516 jsi = JSInterpreter(jscode)
517 initial_function = jsi.extract_function(funcname)
518 return lambda s: initial_function([s])
519
520 def _parse_sig_swf(self, file_contents):
521 swfi = SWFInterpreter(file_contents)
522 TARGET_CLASSNAME = 'SignatureDecipher'
523 searched_class = swfi.extract_class(TARGET_CLASSNAME)
524 initial_function = swfi.extract_function(searched_class, 'decipher')
525 return lambda s: initial_function([s])
526
527 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
528 """Turn the encrypted s field into a working signature"""
529
530 if player_url is None:
531 raise ExtractorError('Cannot decrypt signature without player_url')
532
533 if player_url.startswith('//'):
534 player_url = 'https:' + player_url
535 try:
536 player_id = (player_url, self._signature_cache_id(s))
537 if player_id not in self._player_cache:
538 func = self._extract_signature_function(
539 video_id, player_url, s
540 )
541 self._player_cache[player_id] = func
542 func = self._player_cache[player_id]
543 if self._downloader.params.get('youtube_print_sig_code'):
544 self._print_sig_code(func, s)
545 return func(s)
546 except Exception as e:
547 tb = traceback.format_exc()
548 raise ExtractorError(
549 'Signature extraction failed: ' + tb, cause=e)
550
551 def _get_available_subtitles(self, video_id, webpage):
552 try:
553 sub_list = self._download_webpage(
554 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
555 video_id, note=False)
556 except ExtractorError as err:
557 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
558 return {}
559 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
560
561 sub_lang_list = {}
562 for l in lang_list:
563 lang = l[1]
564 if lang in sub_lang_list:
565 continue
566 params = compat_urllib_parse.urlencode({
567 'lang': lang,
568 'v': video_id,
569 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
570 'name': unescapeHTML(l[0]).encode('utf-8'),
571 })
572 url = 'https://www.youtube.com/api/timedtext?' + params
573 sub_lang_list[lang] = url
574 if not sub_lang_list:
575 self._downloader.report_warning('video doesn\'t have subtitles')
576 return {}
577 return sub_lang_list
578
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> automatic-caption
        URL, or {} (after a warning) when no automatic captions exist.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # The caption base URL and timestamp live in the embedded player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Automatic captions are marked kind="asr" on the original track.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
625
626 @classmethod
627 def extract_id(cls, url):
628 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
629 if mobj is None:
630 raise ExtractorError('Invalid URL: %s' % url)
631 video_id = mobj.group(2)
632 return video_id
633
634 def _extract_from_m3u8(self, manifest_url, video_id):
635 url_map = {}
636 def _get_urls(_manifest):
637 lines = _manifest.split('\n')
638 urls = filter(lambda l: l and not l.startswith('#'),
639 lines)
640 return urls
641 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
642 formats_urls = _get_urls(manifest)
643 for format_url in formats_urls:
644 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
645 url_map[itag] = format_url
646 return url_map
647
648 def _extract_annotations(self, video_id):
649 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
650 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
651
652 def _real_extract(self, url):
653 proto = (
654 'http' if self._downloader.params.get('prefer_insecure', False)
655 else 'https')
656
657 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
658 mobj = re.search(self._NEXT_URL_RE, url)
659 if mobj:
660 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
661 video_id = self.extract_id(url)
662
663 # Get video webpage
664 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
665 pref_cookies = [
666 c for c in self._downloader.cookiejar
667 if c.domain == '.youtube.com' and c.name == 'PREF']
668 for pc in pref_cookies:
669 if 'hl=' in pc.value:
670 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
671 else:
672 if pc.value:
673 pc.value += '&'
674 pc.value += 'hl=en'
675 video_webpage = self._download_webpage(url, video_id)
676
677 # Attempt to extract SWF player URL
678 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
679 if mobj is not None:
680 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
681 else:
682 player_url = None
683
684 # Get video info
685 self.report_video_info_webpage_download(video_id)
686 if re.search(r'player-age-gate-content">', video_webpage) is not None:
687 age_gate = True
688 # We simulate the access to the video from www.youtube.com/v/{video_id}
689 # this can be viewed without login into Youtube
690 data = compat_urllib_parse.urlencode({
691 'video_id': video_id,
692 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
693 'sts': self._search_regex(
694 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
695 })
696 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
697 video_info_webpage = self._download_webpage(
698 video_info_url, video_id,
699 note='Refetching age-gated info webpage',
700 errnote='unable to download video info webpage')
701 video_info = compat_parse_qs(video_info_webpage)
702 else:
703 age_gate = False
704 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
705 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
706 % (video_id, el_type))
707 video_info_webpage = self._download_webpage(video_info_url, video_id,
708 note=False,
709 errnote='unable to download video info webpage')
710 video_info = compat_parse_qs(video_info_webpage)
711 if 'token' in video_info:
712 break
713 if 'token' not in video_info:
714 if 'reason' in video_info:
715 raise ExtractorError(
716 'YouTube said: %s' % video_info['reason'][0],
717 expected=True, video_id=video_id)
718 else:
719 raise ExtractorError(
720 '"token" parameter not in video info for unknown reason',
721 video_id=video_id)
722
723 if 'view_count' in video_info:
724 view_count = int(video_info['view_count'][0])
725 else:
726 view_count = None
727
728 # Check for "rental" videos
729 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
730 raise ExtractorError('"rental" videos not supported')
731
732 # Start extracting information
733 self.report_information_extraction(video_id)
734
735 # uploader
736 if 'author' not in video_info:
737 raise ExtractorError('Unable to extract uploader name')
738 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
739
740 # uploader_id
741 video_uploader_id = None
742 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
743 if mobj is not None:
744 video_uploader_id = mobj.group(1)
745 else:
746 self._downloader.report_warning('unable to extract uploader nickname')
747
748 # title
749 if 'title' in video_info:
750 video_title = video_info['title'][0]
751 else:
752 self._downloader.report_warning('Unable to extract video title')
753 video_title = '_'
754
755 # thumbnail image
756 # We try first to get a high quality image:
757 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
758 video_webpage, re.DOTALL)
759 if m_thumb is not None:
760 video_thumbnail = m_thumb.group(1)
761 elif 'thumbnail_url' not in video_info:
762 self._downloader.report_warning('unable to extract video thumbnail')
763 video_thumbnail = None
764 else: # don't panic if we can't find it
765 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
766
767 # upload date
768 upload_date = None
769 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
770 if mobj is None:
771 mobj = re.search(
772 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
773 video_webpage)
774 if mobj is not None:
775 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
776 upload_date = unified_strdate(upload_date)
777
778 m_cat_container = self._search_regex(
779 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
780 video_webpage, 'categories', fatal=False)
781 if m_cat_container:
782 category = self._html_search_regex(
783 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
784 default=None)
785 video_categories = None if category is None else [category]
786 else:
787 video_categories = None
788
789 # description
790 video_description = get_element_by_id("eow-description", video_webpage)
791 if video_description:
792 video_description = re.sub(r'''(?x)
793 <a\s+
794 (?:[a-zA-Z-]+="[^"]+"\s+)*?
795 title="([^"]+)"\s+
796 (?:[a-zA-Z-]+="[^"]+"\s+)*?
797 class="yt-uix-redirect-link"\s*>
798 [^<]+
799 </a>
800 ''', r'\1', video_description)
801 video_description = clean_html(video_description)
802 else:
803 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
804 if fd_mobj:
805 video_description = unescapeHTML(fd_mobj.group(1))
806 else:
807 video_description = ''
808
809 def _extract_count(count_name):
810 count = self._search_regex(
811 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
812 video_webpage, count_name, default=None)
813 if count is not None:
814 return int(count.replace(',', ''))
815 return None
816 like_count = _extract_count('like')
817 dislike_count = _extract_count('dislike')
818
819 # subtitles
820 video_subtitles = self.extract_subtitles(video_id, video_webpage)
821
822 if self._downloader.params.get('listsubtitles', False):
823 self._list_available_subtitles(video_id, video_webpage)
824 return
825
826 if 'length_seconds' not in video_info:
827 self._downloader.report_warning('unable to extract video duration')
828 video_duration = None
829 else:
830 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
831
832 # annotations
833 video_annotations = None
834 if self._downloader.params.get('writeannotations', False):
835 video_annotations = self._extract_annotations(video_id)
836
837 # Decide which formats to download
838 try:
839 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
840 if not mobj:
841 raise ValueError('Could not find vevo ID')
842 json_code = uppercase_escape(mobj.group(1))
843 ytplayer_config = json.loads(json_code)
844 args = ytplayer_config['args']
845 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
846 # this signatures are encrypted
847 if 'url_encoded_fmt_stream_map' not in args:
848 raise ValueError('No stream_map present') # caught below
849 re_signature = re.compile(r'[&,]s=')
850 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
851 if m_s is not None:
852 self.to_screen('%s: Encrypted signatures detected.' % video_id)
853 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
854 m_s = re_signature.search(args.get('adaptive_fmts', ''))
855 if m_s is not None:
856 if 'adaptive_fmts' in video_info:
857 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
858 else:
859 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
860 except ValueError:
861 pass
862
863 def _map_to_format_list(urlmap):
864 formats = []
865 for itag, video_real_url in urlmap.items():
866 dct = {
867 'format_id': itag,
868 'url': video_real_url,
869 'player_url': player_url,
870 }
871 if itag in self._formats:
872 dct.update(self._formats[itag])
873 formats.append(dct)
874 return formats
875
876 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
877 self.report_rtmp_download()
878 formats = [{
879 'format_id': '_rtmp',
880 'protocol': 'rtmp',
881 'url': video_info['conn'][0],
882 'player_url': player_url,
883 }]
884 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
885 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
886 if 'rtmpe%3Dyes' in encoded_url_map:
887 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
888 url_map = {}
889 for url_data_str in encoded_url_map.split(','):
890 url_data = compat_parse_qs(url_data_str)
891 if 'itag' not in url_data or 'url' not in url_data:
892 continue
893 format_id = url_data['itag'][0]
894 url = url_data['url'][0]
895
896 if 'sig' in url_data:
897 url += '&signature=' + url_data['sig'][0]
898 elif 's' in url_data:
899 encrypted_sig = url_data['s'][0]
900
901 if not age_gate:
902 jsplayer_url_json = self._search_regex(
903 r'"assets":.+?"js":\s*("[^"]+")',
904 video_webpage, 'JS player URL')
905 player_url = json.loads(jsplayer_url_json)
906 if player_url is None:
907 player_url_json = self._search_regex(
908 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
909 video_webpage, 'age gate player URL')
910 player_url = json.loads(player_url_json)
911
912 if self._downloader.params.get('verbose'):
913 if player_url is None:
914 player_version = 'unknown'
915 player_desc = 'unknown'
916 else:
917 if player_url.endswith('swf'):
918 player_version = self._search_regex(
919 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
920 'flash player', fatal=False)
921 player_desc = 'flash player %s' % player_version
922 else:
923 player_version = self._search_regex(
924 r'html5player-([^/]+?)(?:/html5player)?\.js',
925 player_url,
926 'html5 player', fatal=False)
927 player_desc = 'html5 player %s' % player_version
928
929 parts_sizes = self._signature_cache_id(encrypted_sig)
930 self.to_screen('{%s} signature length %s, %s' %
931 (format_id, parts_sizes, player_desc))
932
933 signature = self._decrypt_signature(
934 encrypted_sig, video_id, player_url, age_gate)
935 url += '&signature=' + signature
936 if 'ratebypass' not in url:
937 url += '&ratebypass=yes'
938 url_map[format_id] = url
939 formats = _map_to_format_list(url_map)
940 elif video_info.get('hlsvp'):
941 manifest_url = video_info['hlsvp'][0]
942 url_map = self._extract_from_m3u8(manifest_url, video_id)
943 formats = _map_to_format_list(url_map)
944 else:
945 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
946
947 # Look for the DASH manifest
948 if self._downloader.params.get('youtube_include_dash_manifest', True):
949 try:
950 # The DASH manifest used needs to be the one from the original video_webpage.
951 # The one found in get_video_info seems to be using different signatures.
952 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
953 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
954 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
955 if age_gate:
956 dash_manifest_url = video_info.get('dashmpd')[0]
957 else:
958 dash_manifest_url = ytplayer_config['args']['dashmpd']
959 def decrypt_sig(mobj):
960 s = mobj.group(1)
961 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
962 return '/signature/%s' % dec_s
963 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
964 dash_doc = self._download_xml(
965 dash_manifest_url, video_id,
966 note='Downloading DASH manifest',
967 errnote='Could not download DASH manifest')
968 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
969 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
970 if url_el is None:
971 continue
972 format_id = r.attrib['id']
973 video_url = url_el.text
974 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
975 f = {
976 'format_id': format_id,
977 'url': video_url,
978 'width': int_or_none(r.attrib.get('width')),
979 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
980 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
981 'filesize': filesize,
982 }
983 try:
984 existing_format = next(
985 fo for fo in formats
986 if fo['format_id'] == format_id)
987 except StopIteration:
988 f.update(self._formats.get(format_id, {}))
989 formats.append(f)
990 else:
991 existing_format.update(f)
992
993 except (ExtractorError, KeyError) as e:
994 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
995
996 self._sort_formats(formats)
997
998 return {
999 'id': video_id,
1000 'uploader': video_uploader,
1001 'uploader_id': video_uploader_id,
1002 'upload_date': upload_date,
1003 'title': video_title,
1004 'thumbnail': video_thumbnail,
1005 'description': video_description,
1006 'categories': video_categories,
1007 'subtitles': video_subtitles,
1008 'duration': video_duration,
1009 'age_limit': 18 if age_gate else 0,
1010 'annotations': video_annotations,
1011 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1012 'view_count': view_count,
1013 'like_count': like_count,
1014 'dislike_count': dislike_count,
1015 'formats': formats,
1016 }
1017
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Verbose regex: group 1 captures a playlist id embedded in a full URL,
    # group 2 captures a bare playlist id passed on its own.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Only links that carry an index= parameter count as playlist entries.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        # Log in (if credentials were supplied) before any extraction.
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into url_result info dicts."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated YouTube mix playlist."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup has varied over time; try the known class names
        # from most to least specific.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        # Group 1 matches ids inside a full URL, group 2 bare ids.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the "load more" ajax widget until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1187
1188
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Resolve a yttoplist: pseudo-URL to the matching channel playlist
        and return it as a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        # Find the link on the channel page whose title matches the
        # requested list title.
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            # Fix: the message previously said 'Downloading Youtube mix'
            # (copy-pasted from the mix extractor), which was misleading
            # output for a top list download.
            msg = 'Downloading Youtube top list'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1230
1231
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids found in *page*, de-duplicated, in first-seen
        order.

        Uses a set for the membership test, so a page with n links is
        processed in O(n) instead of the previous O(n^2) list scan.
        """
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Extract all videos of a channel as a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget no longer offers a next page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1291
1292
class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would also match URLs
        # that belong to more specific extractors.
        other_ies = (
            klass for name, klass in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Extract all uploads of a user as a lazily-paged playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Yield one url-type info dict per video on API page *pagenum*.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A page without entries signals the end of the paged list.
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1364
1365
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for the "ytsearchN:query" pseudo-URL scheme."""
    IE_DESC = 'YouTube.com searches'
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        collected_ids = []
        page_index = 0
        # The effective limit shrinks to totalItems once the API reports it.
        result_limit = n

        while PAGE_SIZE * page_index < result_limit:
            start_index = PAGE_SIZE * page_index + 1
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            raw_json = self._download_webpage(
                api_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                video['id'] for video in api_response['items'])

            result_limit = min(n, api_response['totalItems'])
            page_index += 1

        # Drop any surplus results beyond the n that were requested.
        del collected_ids[n:]
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids]
        return self.playlist_result(videos, query)
1407
1408
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that orders results newest-first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1414
1415
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Extract every result entry from a YouTube search results URL."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry(part_code):
            # One <h3 class="yt-lockup-title"> snippet becomes one url entry.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code,
                'item title', fatal=False)
            href_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', href_snippet),
                'title': item_title,
            }

        entries = [
            _entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1457
1458
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Build a playlist with one entry per season playlist of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(
            re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_matches)))
        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': [
                self.url_result(
                    'https://www.youtube.com' + season.group(1),
                    'YoutubePlaylist')
                for season in season_matches],
        }
1493
1494
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (e.g. watch later, history) need a different
        # ajax action than system-wide ones.
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the paged feed and return all videos as a playlist."""
        entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            video_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids)
            # The widget carries the paging token for the next request.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(
            entries, playlist_title=self._PLAYLIST_TITLE)
1540
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed."""
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1546
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch-later list."""
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1553
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch-history feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fix: the pattern was a plain string, so '\.' was an invalid escape
    # sequence (only tolerated by CPython's leniency) and inconsistent with
    # every sibling feed extractor; make it a raw string.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1560
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist;
        # delegate the actual extraction to the playlist extractor.
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1571
1572
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's subscriptions feed."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            # Keep following the "load more" widget until it disappears.
            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1609
1610
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch-all for watch URLs whose video id was cut off (usually by an
    unquoted '&' in the shell); always fails with a helpful message."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Such URLs never contain a usable video id, so the only sensible
        # behaviour is to explain how the id most likely got lost.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)