]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[nfl] Fix test case - download, but don't check md5
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import traceback
11
12 from .common import InfoExtractor, SearchInfoExtractor
13 from .subtitles import SubtitlesInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..utils import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_request,
21 compat_urlparse,
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 uppercase_escape,
34 )
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube UI language to English (non-fatal).

        Returns True on success, False if the request failed.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # FIX: was a bare `return` (None); return False explicitly so the
            # result always matches the documented True/False contract.
            return False

        # Anti-forgery token embedded in the login form; required by Google.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
                # FIX: previously fell through and called match.group(1) on
                # None, raising AttributeError instead of failing cleanly.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
                # FIX: same None-dereference bug as secTok above.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            # Same UTF-8-before-urlencode dance as for the main login form.
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
                                            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set language, log in (abort initialization on failure), confirm age.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
200
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Verbose regex: group 1 is the (optional) URL prefix, group 2 the 11-char
    # video ID. Whitespace/comments are insignificant thanks to (?x).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the next_url query parameter from redirect URLs (age gate etc.)
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static metadata per itag (YouTube format id); merged into each extracted
    # format dict. Negative 'preference' de-prioritizes special-purpose formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": '',
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            u"params": {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
    ]
393
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance memory cache of signature-decryption callables,
        # keyed by (player_url, signature cache id); see _decrypt_signature.
        self._player_cache = {}
397
398 def report_video_info_webpage_download(self, video_id):
399 """Report attempt to download video info webpage."""
400 self.to_screen(u'%s: Downloading video info webpage' % video_id)
401
402 def report_information_extraction(self, video_id):
403 """Report attempt to extract video information."""
404 self.to_screen(u'%s: Extracting video information' % video_id)
405
406 def report_unavailable_format(self, video_id, format):
407 """Report extracted video URL."""
408 self.to_screen(u'%s: Format %s not available' % (video_id, format))
409
410 def report_rtmp_download(self):
411 """Indicate the download will use the RTMP protocol."""
412 self.to_screen(u'RTMP download detected')
413
414 def _signature_cache_id(self, example_sig):
415 """ Return a string representation of a signature """
416 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
417
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers encrypted signatures for the
        player at *player_url*.

        *example_sig* is used only to derive the cache key and a probe
        input: the decryption is a fixed character permutation, persisted
        on disk as a list of source indices ("cache spec").
        """
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id is used as a cache file name; make sure it has no path parts.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached: reorder input characters according to the stored indices.
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            # SWF is binary; fetch raw bytes rather than a decoded webpage.
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            # Probe the freshly extracted function with a known string to
            # learn its permutation so it can be cached for next time.
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
        return res
459
    def _print_sig_code(self, func, example_sig):
        """Print Python source code equivalent to the signature function
        *func*, suitable for hardcoding (--youtube-print-sig-code)."""
        def gen_sig_code(idxs):
            # Turn a list of source indices into compact slice expressions.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (u':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside an arithmetic run: extend it, or emit and close it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two consecutive indices start a new +1/-1 run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index: plain subscript.
                    yield 's[%d]' % prev
            # Flush the trailing element/run after the loop.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe func to recover the permutation it applies.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
498
499 def _parse_sig_js(self, jscode):
500 funcname = self._search_regex(
501 r'signature=([$a-zA-Z]+)', jscode,
502 'Initial JS player signature function name')
503
504 jsi = JSInterpreter(jscode)
505 initial_function = jsi.extract_function(funcname)
506 return lambda s: initial_function([s])
507
508 def _parse_sig_swf(self, file_contents):
509 swfi = SWFInterpreter(file_contents)
510 TARGET_CLASSNAME = 'SignatureDecipher'
511 searched_class = swfi.extract_class(TARGET_CLASSNAME)
512 initial_function = swfi.extract_function(searched_class, 'decipher')
513 return lambda s: initial_function([s])
514
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""
        # NOTE(review): age_gate is not referenced in this body — appears
        # unused here; confirm against callers before removing.

        if player_url is None:
            raise ExtractorError(u'Cannot decrypt signature without player_url')

        # Normalize protocol-relative player URLs.
        if player_url.startswith(u'//'):
            player_url = 'https:' + player_url
        try:
            # Cache decryption functions per (player, signature layout);
            # the layout id only depends on the dot-separated part lengths.
            player_id = (player_url, self._signature_cache_id(s))
            if player_id not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, s
                )
                self._player_cache[player_id] = func
            func = self._player_cache[player_id]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, s)
            return func(s)
        except Exception as e:
            # Wrap any failure with the full traceback so bug reports about
            # changed player code are actionable.
            tb = traceback.format_exc()
            raise ExtractorError(
                'Signature extraction failed: ' + tb, cause=e)
538
    def _get_available_subtitles(self, video_id, webpage):
        """Return a dict mapping subtitle language codes to timedtext URLs.

        *webpage* is not used here; the parameter exists for interface
        parity with _get_available_automatic_caption. Returns {} (after a
        warning) when the list cannot be downloaded or is empty.
        """
        try:
            sub_list = self._download_webpage(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        # Each match is (track name, language code).
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            # Keep only the first track listed for each language.
            if lang in sub_lang_list:
                continue
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = 'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list
566
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language codes to ASR-translation
        caption URLs, or {} (after a warning) if none are available.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Automatic captions exist only when the original track is ASR.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
613
614 @classmethod
615 def extract_id(cls, url):
616 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
617 if mobj is None:
618 raise ExtractorError(u'Invalid URL: %s' % url)
619 video_id = mobj.group(2)
620 return video_id
621
622 def _extract_from_m3u8(self, manifest_url, video_id):
623 url_map = {}
624 def _get_urls(_manifest):
625 lines = _manifest.split('\n')
626 urls = filter(lambda l: l and not l.startswith('#'),
627 lines)
628 return urls
629 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
630 formats_urls = _get_urls(manifest)
631 for format_url in formats_urls:
632 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
633 url_map[itag] = format_url
634 return url_map
635
636 def _extract_annotations(self, video_id):
637 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
638 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
639
640 def _real_extract(self, url):
641 proto = (
642 'http' if self._downloader.params.get('prefer_insecure', False)
643 else 'https')
644
645 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
646 mobj = re.search(self._NEXT_URL_RE, url)
647 if mobj:
648 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
649 video_id = self.extract_id(url)
650
651 # Get video webpage
652 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
653 video_webpage = self._download_webpage(url, video_id)
654
655 # Attempt to extract SWF player URL
656 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
657 if mobj is not None:
658 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
659 else:
660 player_url = None
661
662 # Get video info
663 self.report_video_info_webpage_download(video_id)
664 if re.search(r'player-age-gate-content">', video_webpage) is not None:
665 self.report_age_confirmation()
666 age_gate = True
667 # We simulate the access to the video from www.youtube.com/v/{video_id}
668 # this can be viewed without login into Youtube
669 data = compat_urllib_parse.urlencode({
670 'video_id': video_id,
671 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
672 'sts': self._search_regex(
673 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
674 })
675 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
676 video_info_webpage = self._download_webpage(video_info_url, video_id,
677 note=False,
678 errnote='unable to download video info webpage')
679 video_info = compat_parse_qs(video_info_webpage)
680 else:
681 age_gate = False
682 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
683 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
684 % (video_id, el_type))
685 video_info_webpage = self._download_webpage(video_info_url, video_id,
686 note=False,
687 errnote='unable to download video info webpage')
688 video_info = compat_parse_qs(video_info_webpage)
689 if 'token' in video_info:
690 break
691 if 'token' not in video_info:
692 if 'reason' in video_info:
693 raise ExtractorError(
694 'YouTube said: %s' % video_info['reason'][0],
695 expected=True, video_id=video_id)
696 else:
697 raise ExtractorError(
698 '"token" parameter not in video info for unknown reason',
699 video_id=video_id)
700
701 if 'view_count' in video_info:
702 view_count = int(video_info['view_count'][0])
703 else:
704 view_count = None
705
706 # Check for "rental" videos
707 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
708 raise ExtractorError(u'"rental" videos not supported')
709
710 # Start extracting information
711 self.report_information_extraction(video_id)
712
713 # uploader
714 if 'author' not in video_info:
715 raise ExtractorError(u'Unable to extract uploader name')
716 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
717
718 # uploader_id
719 video_uploader_id = None
720 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
721 if mobj is not None:
722 video_uploader_id = mobj.group(1)
723 else:
724 self._downloader.report_warning(u'unable to extract uploader nickname')
725
726 # title
727 if 'title' in video_info:
728 video_title = video_info['title'][0]
729 else:
730 self._downloader.report_warning(u'Unable to extract video title')
731 video_title = '_'
732
733 # thumbnail image
734 # We try first to get a high quality image:
735 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
736 video_webpage, re.DOTALL)
737 if m_thumb is not None:
738 video_thumbnail = m_thumb.group(1)
739 elif 'thumbnail_url' not in video_info:
740 self._downloader.report_warning(u'unable to extract video thumbnail')
741 video_thumbnail = None
742 else: # don't panic if we can't find it
743 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
744
745 # upload date
746 upload_date = None
747 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
748 if mobj is None:
749 mobj = re.search(
750 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
751 video_webpage)
752 if mobj is not None:
753 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
754 upload_date = unified_strdate(upload_date)
755
756 m_cat_container = self._search_regex(
757 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
758 video_webpage, 'categories', fatal=False)
759 if m_cat_container:
760 category = self._html_search_regex(
761 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
762 default=None)
763 video_categories = None if category is None else [category]
764 else:
765 video_categories = None
766
767 # description
768 video_description = get_element_by_id("eow-description", video_webpage)
769 if video_description:
770 video_description = re.sub(r'''(?x)
771 <a\s+
772 (?:[a-zA-Z-]+="[^"]+"\s+)*?
773 title="([^"]+)"\s+
774 (?:[a-zA-Z-]+="[^"]+"\s+)*?
775 class="yt-uix-redirect-link"\s*>
776 [^<]+
777 </a>
778 ''', r'\1', video_description)
779 video_description = clean_html(video_description)
780 else:
781 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
782 if fd_mobj:
783 video_description = unescapeHTML(fd_mobj.group(1))
784 else:
785 video_description = ''
786
787 def _extract_count(count_name):
788 count = self._search_regex(
789 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
790 video_webpage, count_name, default=None)
791 if count is not None:
792 return int(count.replace(',', ''))
793 return None
794 like_count = _extract_count(u'like')
795 dislike_count = _extract_count(u'dislike')
796
797 # subtitles
798 video_subtitles = self.extract_subtitles(video_id, video_webpage)
799
800 if self._downloader.params.get('listsubtitles', False):
801 self._list_available_subtitles(video_id, video_webpage)
802 return
803
804 if 'length_seconds' not in video_info:
805 self._downloader.report_warning(u'unable to extract video duration')
806 video_duration = None
807 else:
808 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
809
810 # annotations
811 video_annotations = None
812 if self._downloader.params.get('writeannotations', False):
813 video_annotations = self._extract_annotations(video_id)
814
815 # Decide which formats to download
816 try:
817 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
818 if not mobj:
819 raise ValueError('Could not find vevo ID')
820 json_code = uppercase_escape(mobj.group(1))
821 ytplayer_config = json.loads(json_code)
822 args = ytplayer_config['args']
823 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
824 # this signatures are encrypted
825 if 'url_encoded_fmt_stream_map' not in args:
826 raise ValueError(u'No stream_map present') # caught below
827 re_signature = re.compile(r'[&,]s=')
828 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
829 if m_s is not None:
830 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
831 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
832 m_s = re_signature.search(args.get('adaptive_fmts', ''))
833 if m_s is not None:
834 if 'adaptive_fmts' in video_info:
835 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
836 else:
837 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
838 except ValueError:
839 pass
840
841 def _map_to_format_list(urlmap):
842 formats = []
843 for itag, video_real_url in urlmap.items():
844 dct = {
845 'format_id': itag,
846 'url': video_real_url,
847 'player_url': player_url,
848 }
849 if itag in self._formats:
850 dct.update(self._formats[itag])
851 formats.append(dct)
852 return formats
853
854 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
855 self.report_rtmp_download()
856 formats = [{
857 'format_id': '_rtmp',
858 'protocol': 'rtmp',
859 'url': video_info['conn'][0],
860 'player_url': player_url,
861 }]
862 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
863 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
864 if 'rtmpe%3Dyes' in encoded_url_map:
865 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
866 url_map = {}
867 for url_data_str in encoded_url_map.split(','):
868 url_data = compat_parse_qs(url_data_str)
869 if 'itag' not in url_data or 'url' not in url_data:
870 continue
871 format_id = url_data['itag'][0]
872 url = url_data['url'][0]
873
874 if 'sig' in url_data:
875 url += '&signature=' + url_data['sig'][0]
876 elif 's' in url_data:
877 encrypted_sig = url_data['s'][0]
878
879 if not age_gate:
880 jsplayer_url_json = self._search_regex(
881 r'"assets":.+?"js":\s*("[^"]+")',
882 video_webpage, 'JS player URL')
883 player_url = json.loads(jsplayer_url_json)
884 if player_url is None:
885 player_url_json = self._search_regex(
886 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
887 video_webpage, 'age gate player URL')
888 player_url = json.loads(player_url_json)
889
890 if self._downloader.params.get('verbose'):
891 if player_url is None:
892 player_version = 'unknown'
893 player_desc = 'unknown'
894 else:
895 if player_url.endswith('swf'):
896 player_version = self._search_regex(
897 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
898 'flash player', fatal=False)
899 player_desc = 'flash player %s' % player_version
900 else:
901 player_version = self._search_regex(
902 r'html5player-([^/]+?)(?:/html5player)?\.js',
903 player_url,
904 'html5 player', fatal=False)
905 player_desc = 'html5 player %s' % player_version
906
907 parts_sizes = self._signature_cache_id(encrypted_sig)
908 self.to_screen(u'{%s} signature length %s, %s' %
909 (format_id, parts_sizes, player_desc))
910
911 signature = self._decrypt_signature(
912 encrypted_sig, video_id, player_url, age_gate)
913 url += '&signature=' + signature
914 if 'ratebypass' not in url:
915 url += '&ratebypass=yes'
916 url_map[format_id] = url
917 formats = _map_to_format_list(url_map)
918 elif video_info.get('hlsvp'):
919 manifest_url = video_info['hlsvp'][0]
920 url_map = self._extract_from_m3u8(manifest_url, video_id)
921 formats = _map_to_format_list(url_map)
922 else:
923 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
924
925 # Look for the DASH manifest
926 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
927 try:
928 # The DASH manifest used needs to be the one from the original video_webpage.
929 # The one found in get_video_info seems to be using different signatures.
930 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
931 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
932 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
933 if age_gate:
934 dash_manifest_url = video_info.get('dashmpd')[0]
935 else:
936 dash_manifest_url = ytplayer_config['args']['dashmpd']
937 def decrypt_sig(mobj):
938 s = mobj.group(1)
939 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
940 return '/signature/%s' % dec_s
941 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
942 dash_doc = self._download_xml(
943 dash_manifest_url, video_id,
944 note=u'Downloading DASH manifest',
945 errnote=u'Could not download DASH manifest')
946 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
947 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
948 if url_el is None:
949 continue
950 format_id = r.attrib['id']
951 video_url = url_el.text
952 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
953 f = {
954 'format_id': format_id,
955 'url': video_url,
956 'width': int_or_none(r.attrib.get('width')),
957 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
958 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
959 'filesize': filesize,
960 }
961 try:
962 existing_format = next(
963 fo for fo in formats
964 if fo['format_id'] == format_id)
965 except StopIteration:
966 f.update(self._formats.get(format_id, {}))
967 formats.append(f)
968 else:
969 existing_format.update(f)
970
971 except (ExtractorError, KeyError) as e:
972 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
973
974 self._sort_formats(formats)
975
976 return {
977 'id': video_id,
978 'uploader': video_uploader,
979 'uploader_id': video_uploader_id,
980 'upload_date': upload_date,
981 'title': video_title,
982 'thumbnail': video_thumbnail,
983 'description': video_description,
984 'categories': video_categories,
985 'subtitles': video_subtitles,
986 'duration': video_duration,
987 'age_limit': 18 if age_gate else 0,
988 'annotations': video_annotations,
989 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
990 'view_count': view_count,
991 'like_count': like_count,
992 'dislike_count': dislike_count,
993 'formats': formats,
994 }
995
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist (regular lists and mixes)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Marker present in the "load more" widget HTML while further pages exist.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Also matches entries with index=0; those are filtered out in _real_extract.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': 'Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }]

    def _real_initialize(self):
        # Private playlists are only visible when logged in.
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id into a url_result dict delegating to the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix ('RD…' id), which has no regular playlist page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title markup variants, most specific first.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                     href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the href of the next AJAX page;
            # its absence means the last page was reached.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1151
1152
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a "yttoplist:{channel}:{list title}" pseudo-URL to its playlist."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = []

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel_name = match.group('chann')
        list_title = match.group('title')

        # Find the playlist link on the channel page via its URL-encoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': list_title})
        link_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_html = self._download_webpage(
            'https://www.youtube.com/%s' % channel_name, list_title)
        playlist_link = self._html_search_regex(link_re, channel_html, 'list')
        playlist_url = compat_urlparse.urljoin('https://www.youtube.com/', playlist_link)

        entry_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        video_ids = []
        for attempt in itertools.count(0):
            note = 'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            page = self._download_webpage(playlist_url, list_title, note)
            video_ids = orderedSet(re.findall(entry_re, page))
            if video_ids:
                break

        return self.playlist_result(
            self._ids_to_results(video_ids), playlist_title=list_title)
1185
1186
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids of all /watch?v= links in *page* HTML,
        deduplicated while preserving first-seen order."""
        ids_in_page = []
        # Track already-seen ids in a set: O(1) membership test instead of
        # rescanning the result list for every match (was O(n^2)).
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels carry one of these CSS classes in their markup.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Absence of the "load more" marker means the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1241
1242
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploads via the GData API, one page at a time."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # The API returns at most this many entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: this regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url_result dicts for one GDATA page (called lazily by PagedList)."""
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Page past the last upload: stop yielding.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id field is a URL whose last path segment is the video id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1303
1304
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the GData API."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        collected_ids = []
        page_idx = 0
        # The effective limit shrinks once the API reports fewer total items.
        limit = n

        while PAGE_SIZE * page_idx < limit:
            request_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_idx + 1)
            raw_json = self._download_webpage(
                request_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_idx + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(item['id'] for item in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_idx += 1

        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1346
1347
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search as YoutubeSearchIE, but the API orders results by publish date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1353
1354
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the entries of a YouTube results page URL as a playlist."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result item is wrapped in an <h3 class="yt-lockup-title"> element.
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1389
1390
class YoutubeShowIE(InfoExtractor):
    """Extract every season playlist of a multi-season show page."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = 'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches]
1404
1405
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # All feeds are tied to an account, so a login is mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining '%s' placeholder for the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # Paging token for the feed_ajax endpoint; updated from each response.
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            # The response key differs between feeds; fall back accordingly.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                # No further "load more" link: last page reached.
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1451
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extract the logged-in user's recommended-videos feed (":ytrec")."""
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1457
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extract the logged-in user's "watch later" list (":ytwatchlater")."""
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # Uses the account-specific feed_ajax action (see YoutubeFeedsInfoExtractor).
    _PERSONAL_FEED = True
1464
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extract the logged-in user's watch history (":ythistory")."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # (DeprecationWarning/SyntaxWarning on newer Pythons); this also matches
    # the r'' style of every sibling feed extractor.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1471
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to their backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a regular playlist id; once found,
        # extraction is delegated to the playlist extractor.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1482
1483
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the logged-in user's subscriptions feed (":ytsubs")."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            # The "load more" widget links the next AJAX page; its absence
            # means all pages have been consumed.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1520
1521
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs whose v= parameter was cut off (typically an
    unquoted '&' in the shell) and raise a helpful error instead of
    failing cryptically elsewhere."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extracts anything: the whole point is the error message.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)