]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
Merge branch 'peugeot-tnaflix'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube UI to English (persisted via URL params).

        Returns True if the request succeeded, False otherwise (fatal=False).
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # FIX: return False (not a bare `return`/None) so the result
            # matches the documented True/False contract of this method.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,

            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
                # FIX: bail out instead of falling through and crashing with
                # AttributeError on match.group(1) below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
                # FIX: same as secTok above - warn and fail instead of crashing.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'smsToken': u'',
                u'smsUserPin': tfa_code,
                u'smsVerifyPin': u'Verify',

                u'PersistentCookie': u'yes',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'pstMsg': u'1',
                u'secTok': secTok,
                u'timeStmp': timeStmp,
                u'service': u'youtube',
                u'hl': u'en_US',
            }
            # Same UTF-8-before-urlencode dance as for the first form.
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; returns True (errors are fatal)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
                                            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set language, log in (if credentials were given) and confirm age,
        # stopping at the first step that fails.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
200
201
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Extractor for single YouTube videos (watch pages, embeds, youtu.be,
    and several mirror/redirector hostnames)."""
    IE_DESC = u'YouTube.com'
    # Verbose regex: group 1 matches the URL prefix (optional, so a naked
    # 11-character video ID is also accepted); group 2 is the video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the original URL out of redirecting ones (?next_url=...).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format metadata. Negative 'preference' values demote
    # special-purpose formats (3D, HLS, DASH) below the regular muxed ones.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
306
    IE_NAME = u'youtube'
    # Test-suite fixtures: each entry downloads the given URL and compares
    # the extracted metadata against info_dict.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
393
394
395 @classmethod
396 def suitable(cls, url):
397 """Receives a URL and returns True if suitable for this IE."""
398 if YoutubePlaylistIE.suitable(url): return False
399 return re.match(cls._VALID_URL, url) is not None
400
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> signature decryption
        # function, so each player is only downloaded/parsed once.
        self._player_cache = {}
404
405 def report_video_info_webpage_download(self, video_id):
406 """Report attempt to download video info webpage."""
407 self.to_screen(u'%s: Downloading video info webpage' % video_id)
408
409 def report_information_extraction(self, video_id):
410 """Report attempt to extract video information."""
411 self.to_screen(u'%s: Extracting video information' % video_id)
412
413 def report_unavailable_format(self, video_id, format):
414 """Report extracted video URL."""
415 self.to_screen(u'%s: Format %s not available' % (video_id, format))
416
417 def report_rtmp_download(self):
418 """Indicate the download will use the RTMP protocol."""
419 self.to_screen(u'RTMP download detected')
420
421 def _signature_cache_id(self, example_sig):
422 """ Return a string representation of a signature """
423 return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
424
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the on-disk cache) the signature decryption
        function for the player at *player_url*.

        Returns a callable mapping an encrypted signature string to the
        decrypted one. *example_sig* is only used for its length pattern,
        which keys the cache entry.
        """
        # Player URLs end in <something>-<id>[/watch_as3|/html5player].<ext>;
        # ext distinguishes JS (html5) from SWF (flash) players.
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a filename component; it must not contain path
        # separators.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    # Cached spec is a list of source indices; deciphering is
                    # just a character permutation/selection.
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available
            except ValueError:
                # Corrupt cache file: warn (including size or stat error for
                # debugging) and fall through to recompute.
                try:
                    file_size = os.path.getsize(cache_fn)
                except (OSError, IOError) as oe:
                    file_size = str(oe)
                self._downloader.report_warning(
                    u'Cache %s failed (%s)' % (cache_fn, file_size))

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Recover the permutation by running a string of unique
            # characters through the function, then persist it as JSON.
            # Cache write failures are non-fatal.
            try:
                test_string = u''.join(map(compat_chr, range(len(example_sig))))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
492
    def _print_sig_code(self, func, example_sig):
        """Print ready-to-paste Python code equivalent to deciphering
        function *func* (used by --youtube-print-sig-code).

        *example_sig* only supplies the signature's length pattern.
        """
        def gen_sig_code(idxs):
            # Compress the index list into slice expressions wherever the
            # indices form an arithmetic run with step +1 or -1.
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # NOTE(review): assumes len(idxs) >= 2 - 'i' below is the last
            # value left over from this loop.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it or flush it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Run a string of unique characters through func to recover the
        # permutation it applies.
        test_string = u''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                u'    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
531
532 def _parse_sig_js(self, jscode):
533 funcname = self._search_regex(
534 r'signature=([$a-zA-Z]+)', jscode,
535 u'Initial JS player signature function name')
536
537 jsi = JSInterpreter(jscode)
538 initial_function = jsi.extract_function(funcname)
539 return lambda s: initial_function([s])
540
541 def _parse_sig_swf(self, file_contents):
542 swfi = SWFInterpreter(file_contents)
543 TARGET_CLASSNAME = u'SignatureDecipher'
544 searched_class = swfi.extract_class(TARGET_CLASSNAME)
545 initial_function = swfi.extract_function(searched_class, u'decipher')
546 return lambda s: initial_function([s])
547
548 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
549 """Turn the encrypted s field into a working signature"""
550
551 if player_url is None:
552 raise ExtractorError(u'Cannot decrypt signature without player_url')
553
554 if player_url.startswith(u'//'):
555 player_url = u'https:' + player_url
556 try:
557 player_id = (player_url, self._signature_cache_id(s))
558 if player_id not in self._player_cache:
559 func = self._extract_signature_function(
560 video_id, player_url, s
561 )
562 self._player_cache[player_id] = func
563 func = self._player_cache[player_id]
564 if self._downloader.params.get('youtube_print_sig_code'):
565 self._print_sig_code(func, s)
566 return func(s)
567 except Exception as e:
568 tb = traceback.format_exc()
569 raise ExtractorError(
570 u'Signature extraction failed: ' + tb, cause=e)
571
572 def _get_available_subtitles(self, video_id, webpage):
573 try:
574 sub_list = self._download_webpage(
575 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
576 video_id, note=False)
577 except ExtractorError as err:
578 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
579 return {}
580 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
581
582 sub_lang_list = {}
583 for l in lang_list:
584 lang = l[1]
585 if lang in sub_lang_list:
586 continue
587 params = compat_urllib_parse.urlencode({
588 'lang': lang,
589 'v': video_id,
590 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
591 'name': unescapeHTML(l[0]).encode('utf-8'),
592 })
593 url = u'https://www.youtube.com/api/timedtext?' + params
594 sub_lang_list[lang] = url
595 if not sub_lang_list:
596 self._downloader.report_warning(u'video doesn\'t have subtitles')
597 return {}
598 return sub_lang_list
599
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language codes to ASR caption URLs,
        or an empty dict if automatic captions are unavailable.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL and timestamp live in the inline player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Automatic captions exist only if the original track is
            # speech-recognized (kind="asr").
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
646
647 @classmethod
648 def extract_id(cls, url):
649 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
650 if mobj is None:
651 raise ExtractorError(u'Invalid URL: %s' % url)
652 video_id = mobj.group(2)
653 return video_id
654
655 def _extract_from_m3u8(self, manifest_url, video_id):
656 url_map = {}
657 def _get_urls(_manifest):
658 lines = _manifest.split('\n')
659 urls = filter(lambda l: l and not l.startswith('#'),
660 lines)
661 return urls
662 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
663 formats_urls = _get_urls(manifest)
664 for format_url in formats_urls:
665 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
666 url_map[itag] = format_url
667 return url_map
668
669 def _extract_annotations(self, video_id):
670 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
671 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
672
673 def _real_extract(self, url):
674 proto = (
675 u'http' if self._downloader.params.get('prefer_insecure', False)
676 else u'https')
677
678 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
679 mobj = re.search(self._NEXT_URL_RE, url)
680 if mobj:
681 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
682 video_id = self.extract_id(url)
683
684 # Get video webpage
685 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
686 video_webpage = self._download_webpage(url, video_id)
687
688 # Attempt to extract SWF player URL
689 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
690 if mobj is not None:
691 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
692 else:
693 player_url = None
694
695 # Get video info
696 self.report_video_info_webpage_download(video_id)
697 if re.search(r'player-age-gate-content">', video_webpage) is not None:
698 self.report_age_confirmation()
699 age_gate = True
700 # We simulate the access to the video from www.youtube.com/v/{video_id}
701 # this can be viewed without login into Youtube
702 data = compat_urllib_parse.urlencode({
703 'video_id': video_id,
704 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
705 'sts': self._search_regex(
706 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
707 })
708 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
709 video_info_webpage = self._download_webpage(video_info_url, video_id,
710 note=False,
711 errnote='unable to download video info webpage')
712 video_info = compat_parse_qs(video_info_webpage)
713 else:
714 age_gate = False
715 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
716 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
717 % (video_id, el_type))
718 video_info_webpage = self._download_webpage(video_info_url, video_id,
719 note=False,
720 errnote='unable to download video info webpage')
721 video_info = compat_parse_qs(video_info_webpage)
722 if 'token' in video_info:
723 break
724 if 'token' not in video_info:
725 if 'reason' in video_info:
726 raise ExtractorError(
727 u'YouTube said: %s' % video_info['reason'][0],
728 expected=True, video_id=video_id)
729 else:
730 raise ExtractorError(
731 u'"token" parameter not in video info for unknown reason',
732 video_id=video_id)
733
734 if 'view_count' in video_info:
735 view_count = int(video_info['view_count'][0])
736 else:
737 view_count = None
738
739 # Check for "rental" videos
740 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
741 raise ExtractorError(u'"rental" videos not supported')
742
743 # Start extracting information
744 self.report_information_extraction(video_id)
745
746 # uploader
747 if 'author' not in video_info:
748 raise ExtractorError(u'Unable to extract uploader name')
749 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
750
751 # uploader_id
752 video_uploader_id = None
753 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
754 if mobj is not None:
755 video_uploader_id = mobj.group(1)
756 else:
757 self._downloader.report_warning(u'unable to extract uploader nickname')
758
759 # title
760 if 'title' in video_info:
761 video_title = video_info['title'][0]
762 else:
763 self._downloader.report_warning(u'Unable to extract video title')
764 video_title = u'_'
765
766 # thumbnail image
767 # We try first to get a high quality image:
768 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
769 video_webpage, re.DOTALL)
770 if m_thumb is not None:
771 video_thumbnail = m_thumb.group(1)
772 elif 'thumbnail_url' not in video_info:
773 self._downloader.report_warning(u'unable to extract video thumbnail')
774 video_thumbnail = None
775 else: # don't panic if we can't find it
776 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
777
778 # upload date
779 upload_date = None
780 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
781 if mobj is None:
782 mobj = re.search(
783 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
784 video_webpage)
785 if mobj is not None:
786 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
787 upload_date = unified_strdate(upload_date)
788
789 m_cat_container = self._search_regex(
790 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
791 video_webpage, 'categories', fatal=False)
792 if m_cat_container:
793 category = self._html_search_regex(
794 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
795 default=None)
796 video_categories = None if category is None else [category]
797 else:
798 video_categories = None
799
800 # description
801 video_description = get_element_by_id("eow-description", video_webpage)
802 if video_description:
803 video_description = re.sub(r'''(?x)
804 <a\s+
805 (?:[a-zA-Z-]+="[^"]+"\s+)*?
806 title="([^"]+)"\s+
807 (?:[a-zA-Z-]+="[^"]+"\s+)*?
808 class="yt-uix-redirect-link"\s*>
809 [^<]+
810 </a>
811 ''', r'\1', video_description)
812 video_description = clean_html(video_description)
813 else:
814 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
815 if fd_mobj:
816 video_description = unescapeHTML(fd_mobj.group(1))
817 else:
818 video_description = u''
819
        def _extract_count(count_name):
            # Pull a numeric counter (e.g. 'like'/'dislike') out of the watch
            # page markup. Returns the count as an int, or None when the
            # corresponding element is absent.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                # The page renders thousands separators ("1,234"); strip them
                # before converting.
                return int(count.replace(',', ''))
            return None
827 like_count = _extract_count(u'like')
828 dislike_count = _extract_count(u'dislike')
829
830 # subtitles
831 video_subtitles = self.extract_subtitles(video_id, video_webpage)
832
833 if self._downloader.params.get('listsubtitles', False):
834 self._list_available_subtitles(video_id, video_webpage)
835 return
836
837 if 'length_seconds' not in video_info:
838 self._downloader.report_warning(u'unable to extract video duration')
839 video_duration = None
840 else:
841 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
842
843 # annotations
844 video_annotations = None
845 if self._downloader.params.get('writeannotations', False):
846 video_annotations = self._extract_annotations(video_id)
847
848 # Decide which formats to download
849 try:
850 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
851 if not mobj:
852 raise ValueError('Could not find vevo ID')
853 json_code = uppercase_escape(mobj.group(1))
854 ytplayer_config = json.loads(json_code)
855 args = ytplayer_config['args']
856 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
857 # this signatures are encrypted
858 if 'url_encoded_fmt_stream_map' not in args:
859 raise ValueError(u'No stream_map present') # caught below
860 re_signature = re.compile(r'[&,]s=')
861 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
862 if m_s is not None:
863 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
864 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
865 m_s = re_signature.search(args.get('adaptive_fmts', u''))
866 if m_s is not None:
867 if 'adaptive_fmts' in video_info:
868 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
869 else:
870 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
871 except ValueError:
872 pass
873
        def _map_to_format_list(urlmap):
            """Convert an {itag: download_url} mapping into format dicts."""
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                # Merge in the static per-itag metadata (resolution, codecs,
                # ...) when this itag is a known one.
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats
886
887 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
888 self.report_rtmp_download()
889 formats = [{
890 'format_id': '_rtmp',
891 'protocol': 'rtmp',
892 'url': video_info['conn'][0],
893 'player_url': player_url,
894 }]
895 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
896 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
897 if 'rtmpe%3Dyes' in encoded_url_map:
898 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
899 url_map = {}
900 for url_data_str in encoded_url_map.split(','):
901 url_data = compat_parse_qs(url_data_str)
902 if 'itag' not in url_data or 'url' not in url_data:
903 continue
904 format_id = url_data['itag'][0]
905 url = url_data['url'][0]
906
907 if 'sig' in url_data:
908 url += '&signature=' + url_data['sig'][0]
909 elif 's' in url_data:
910 encrypted_sig = url_data['s'][0]
911
912 if not age_gate:
913 jsplayer_url_json = self._search_regex(
914 r'"assets":.+?"js":\s*("[^"]+")',
915 video_webpage, u'JS player URL')
916 player_url = json.loads(jsplayer_url_json)
917 if player_url is None:
918 player_url_json = self._search_regex(
919 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
920 video_webpage, u'age gate player URL')
921 player_url = json.loads(player_url_json)
922
923 if self._downloader.params.get('verbose'):
924 if player_url is None:
925 player_version = 'unknown'
926 player_desc = 'unknown'
927 else:
928 if player_url.endswith('swf'):
929 player_version = self._search_regex(
930 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
931 u'flash player', fatal=False)
932 player_desc = 'flash player %s' % player_version
933 else:
934 player_version = self._search_regex(
935 r'html5player-([^/]+?)(?:/html5player)?\.js',
936 player_url,
937 'html5 player', fatal=False)
938 player_desc = u'html5 player %s' % player_version
939
940 parts_sizes = self._signature_cache_id(encrypted_sig)
941 self.to_screen(u'{%s} signature length %s, %s' %
942 (format_id, parts_sizes, player_desc))
943
944 signature = self._decrypt_signature(
945 encrypted_sig, video_id, player_url, age_gate)
946 url += '&signature=' + signature
947 if 'ratebypass' not in url:
948 url += '&ratebypass=yes'
949 url_map[format_id] = url
950 formats = _map_to_format_list(url_map)
951 elif video_info.get('hlsvp'):
952 manifest_url = video_info['hlsvp'][0]
953 url_map = self._extract_from_m3u8(manifest_url, video_id)
954 formats = _map_to_format_list(url_map)
955 else:
956 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
957
958 # Look for the DASH manifest
959 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
960 try:
961 # The DASH manifest used needs to be the one from the original video_webpage.
962 # The one found in get_video_info seems to be using different signatures.
963 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
964 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
965 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
966 if age_gate:
967 dash_manifest_url = video_info.get('dashmpd')[0]
968 else:
969 dash_manifest_url = ytplayer_config['args']['dashmpd']
970 def decrypt_sig(mobj):
971 s = mobj.group(1)
972 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
973 return '/signature/%s' % dec_s
974 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
975 dash_doc = self._download_xml(
976 dash_manifest_url, video_id,
977 note=u'Downloading DASH manifest',
978 errnote=u'Could not download DASH manifest')
979 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
980 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
981 if url_el is None:
982 continue
983 format_id = r.attrib['id']
984 video_url = url_el.text
985 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
986 f = {
987 'format_id': format_id,
988 'url': video_url,
989 'width': int_or_none(r.attrib.get('width')),
990 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
991 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
992 'filesize': filesize,
993 }
994 try:
995 existing_format = next(
996 fo for fo in formats
997 if fo['format_id'] == format_id)
998 except StopIteration:
999 f.update(self._formats.get(format_id, {}))
1000 formats.append(f)
1001 else:
1002 existing_format.update(f)
1003
1004 except (ExtractorError, KeyError) as e:
1005 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
1006
1007 self._sort_formats(formats)
1008
1009 return {
1010 'id': video_id,
1011 'uploader': video_uploader,
1012 'uploader_id': video_uploader_id,
1013 'upload_date': upload_date,
1014 'title': video_title,
1015 'thumbnail': video_thumbnail,
1016 'description': video_description,
1017 'categories': video_categories,
1018 'subtitles': video_subtitles,
1019 'duration': video_duration,
1020 'age_limit': 18 if age_gate else 0,
1021 'annotations': video_annotations,
1022 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1023 'view_count': view_count,
1024 'like_count': like_count,
1025 'dislike_count': dislike_count,
1026 'formats': formats,
1027 }
1028
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a YouTube playlist (including mixes and 'p/' URLs)."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Presence of this marker in the "load more" widget means another page exists.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Playlists may be private; log in when credentials were supplied.
        self._login()

    def _ids_to_results(self, ids):
        """Wrap a list of video ids into url results handled by the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video:
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup has changed over time; try known class names in turn.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL (watch?v=...&list=...)
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages, following the
        # ajax "load more" widget until it disappears.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1144
1145
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a "yttoplist:{channel}:{title}" query to the matching playlist."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Find the playlist link on the channel page via its urlencoded title.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # The playlist page sometimes arrives without the embedded video list;
        # keep re-fetching until the ids actually show up.
        attempt = 0
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1177
1178
class YoutubeChannelIE(InfoExtractor):
    """Extract every upload of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from the page, deduplicated, in order."""
        return orderedSet(
            match.group(1)
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Autogenerated channels serve all their videos on one page and their
        # ajax endpoints come back empty, so detect them up front.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the "load more"
            # marker disappears.
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1233
1234
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is very permissive, so defer to every other YouTube
        # extractor first and only claim the URL when none of them matches.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE entries, so the
        # uploads feed is fetched lazily, page by page, until a page has no
        # 'entry' key (i.e. we got all of them).
        def fetch_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            raw_page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(raw_page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            for entry in response['feed']['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }

        url_results = PagedList(fetch_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1295
1296
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the GData API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        collected = []
        page_index = 0
        limit = n

        while PAGE_SIZE * page_index < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_index + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected.extend(video['id'] for video in api_response['items'])

            # The API may report fewer total results than we asked for.
            limit = min(n, api_response['totalItems'])
            page_index += 1

        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected[:n]]
        return self.playlist_result(videos, query)
1338
1339
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as the plain search, but ordered by publication date (newest first)."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Identical API query to the parent, plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1345
1346
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list of a YouTube search-results URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        # Narrow the page down to the result list before scanning for entries.
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entries.append({
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1381
1382
class YoutubeShowIE(InfoExtractor):
    """Extract every season playlist of a multi-season YouTube show."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in seasons]
1396
1397
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = ('action_load_personal_feed' if self._PERSONAL_FEED
                  else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        page_index = 1
        # Follow the feed's "load more" paging token until it runs out.
        while True:
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_index)
            feed_html = info.get('feed_html') or info.get('content_html')
            video_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids)
            next_link = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_link is None:
                break
            paging = next_link.group('paging')
            page_index += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1442
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1448
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's watch-later feed (personal, needs authentication)."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1455
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's watch-history feed (personal, needs authentication)."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Use a raw string like every sibling extractor; the previous plain u''
    # literal only matched because '\.' is an unrecognized escape sequence
    # (same string value, clearer intent).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1462
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds the id of the underlying playlist;
        # delegate the actual extraction to the playlist IE.
        playlist_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1473
1474
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the authenticated user's subscriptions feed."""
    IE_NAME = u'youtube:subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

    def _real_extract(self, url):
        title = u'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same pagination scheme as playlists, but the video links carry no
        # index attribute, hence the simpler id regex.
        ids = []
        more_widget_html = content_html = page

        page_num = 1
        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            next_page = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not next_page:
                break

            more = self._download_json(
                'https://youtube.com/%s' % next_page.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1510
1511
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose video id was eaten by the shell and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Fail deliberately, with actionable advice: the URL reached us with
        # its video id stripped, almost always because of an unquoted '&'.
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)