]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[mit] Modernize
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from .subtitles import SubtitlesInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24 )
25 from ..utils import (
26 clean_html,
27 ExtractorError,
28 get_element_by_attribute,
29 get_element_by_id,
30 int_or_none,
31 OnDemandPagedList,
32 orderedSet,
33 unescapeHTML,
34 unified_strdate,
35 uppercase_escape,
36 )
37
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Pin the interface language to English via the PREF cookie so that
        # text-based extraction is stable regardless of account/region.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Abort here: previously execution fell through and crashed on
                # match.group(1) with an AttributeError.
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Same fix as for secTok above.
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before the first extraction; skip when there is no
        # downloader attached (e.g. during some test setups).
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
185
186
187 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
188 IE_DESC = 'YouTube.com'
189 _VALID_URL = r"""(?x)^
190 (
191 (?:https?://|//) # http(s):// or protocol-independent URL
192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
193 (?:www\.)?deturl\.com/www\.youtube\.com/|
194 (?:www\.)?pwnyoutube\.com/|
195 (?:www\.)?yourepeat\.com/|
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
201 |(?: # or the v= param in all its forms
202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
210 )
211 )? # all until now is optional -> you can pass the naked ID
212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
234
235 # 3d videos
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
243
244 # Apple HTTP Live Streaming
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
252
253 # DASH mp4 video
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
265
266 # Dash mp4 audio
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
270
271 # Dash webm
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
291
292 # Dash webm audio
293 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
294 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
295
296 # Dash webm audio with opus inside
297 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
298 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
299 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
300
301 # RTMP (unnamed)
302 '_rtmp': {'protocol': 'rtmp'},
303 }
304
305 IE_NAME = 'youtube'
306 _TESTS = [
307 {
308 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
309 'info_dict': {
310 'id': 'BaW_jenozKc',
311 'ext': 'mp4',
312 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
313 'uploader': 'Philipp Hagemeister',
314 'uploader_id': 'phihag',
315 'upload_date': '20121002',
316 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
317 'categories': ['Science & Technology'],
318 'like_count': int,
319 'dislike_count': int,
320 }
321 },
322 {
323 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
324 'note': 'Test generic use_cipher_signature video (#897)',
325 'info_dict': {
326 'id': 'UxxajLWwzqY',
327 'ext': 'mp4',
328 'upload_date': '20120506',
329 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
330 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
331 'uploader': 'Icona Pop',
332 'uploader_id': 'IconaPop',
333 }
334 },
335 {
336 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
337 'note': 'Test VEVO video with age protection (#956)',
338 'info_dict': {
339 'id': '07FYdnEawAQ',
340 'ext': 'mp4',
341 'upload_date': '20130703',
342 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
343 'description': 'md5:64249768eec3bc4276236606ea996373',
344 'uploader': 'justintimberlakeVEVO',
345 'uploader_id': 'justintimberlakeVEVO',
346 }
347 },
348 {
349 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
350 'note': 'Embed-only video (#1746)',
351 'info_dict': {
352 'id': 'yZIXLfi8CZQ',
353 'ext': 'mp4',
354 'upload_date': '20120608',
355 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
356 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
357 'uploader': 'SET India',
358 'uploader_id': 'setindia'
359 }
360 },
361 {
362 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
363 'note': '256k DASH audio (format 141) via DASH manifest',
364 'info_dict': {
365 'id': 'a9LDPn-MO4I',
366 'ext': 'm4a',
367 'upload_date': '20121002',
368 'uploader_id': '8KVIDEO',
369 'description': '',
370 'uploader': '8KVIDEO',
371 'title': 'UHDTV TEST 8K VIDEO.mp4'
372 },
373 'params': {
374 'youtube_include_dash_manifest': True,
375 'format': '141',
376 },
377 },
378 # DASH manifest with encrypted signature
379 {
380 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
381 'info_dict': {
382 'id': 'IB3lcPjvWLA',
383 'ext': 'm4a',
384 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
385 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
386 'uploader': 'AfrojackVEVO',
387 'uploader_id': 'AfrojackVEVO',
388 'upload_date': '20131011',
389 },
390 'params': {
391 'youtube_include_dash_manifest': True,
392 'format': '141',
393 },
394 },
395 # Controversy video
396 {
397 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
398 'info_dict': {
399 'id': 'T4XJQO3qol8',
400 'ext': 'mp4',
401 'upload_date': '20100909',
402 'uploader': 'The Amazing Atheist',
403 'uploader_id': 'TheAmazingAtheist',
404 'title': 'Burning Everyone\'s Koran',
405 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
406 }
407 },
408 # Normal age-gate video (No vevo, embed allowed)
409 {
410 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
411 'info_dict': {
412 'id': 'HtVdAasjOgU',
413 'ext': 'mp4',
414 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
415 'description': 'md5:eca57043abae25130f58f655ad9a7771',
416 'uploader': 'The Witcher',
417 'uploader_id': 'WitcherGame',
418 'upload_date': '20140605',
419 },
420 },
421 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
422 {
423 'url': '__2ABJjxzNo',
424 'info_dict': {
425 'id': '__2ABJjxzNo',
426 'ext': 'mp4',
427 'upload_date': '20100430',
428 'uploader_id': 'deadmau5',
429 'description': 'md5:12c56784b8032162bb936a5f76d55360',
430 'uploader': 'deadmau5',
431 'title': 'Deadmau5 - Some Chords (HD)',
432 },
433 'expected_warnings': [
434 'DASH manifest missing',
435 ]
436 },
437 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
438 {
439 'url': 'lqQg6PlCWgI',
440 'info_dict': {
441 'id': 'lqQg6PlCWgI',
442 'ext': 'mp4',
443 'upload_date': '20120731',
444 'uploader_id': 'olympic',
445 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
446 'uploader': 'Olympics',
447 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
448 },
449 'params': {
450 'skip_download': 'requires avconv',
451 }
452 },
453 ]
454
455 def __init__(self, *args, **kwargs):
456 super(YoutubeIE, self).__init__(*args, **kwargs)
457 self._player_cache = {}
458
459 def report_video_info_webpage_download(self, video_id):
460 """Report attempt to download video info webpage."""
461 self.to_screen('%s: Downloading video info webpage' % video_id)
462
463 def report_information_extraction(self, video_id):
464 """Report attempt to extract video information."""
465 self.to_screen('%s: Extracting video information' % video_id)
466
467 def report_unavailable_format(self, video_id, format):
468 """Report extracted video URL."""
469 self.to_screen('%s: Format %s not available' % (video_id, format))
470
471 def report_rtmp_download(self):
472 """Indicate the download will use the RTMP protocol."""
473 self.to_screen('RTMP download detected')
474
475 def _signature_cache_id(self, example_sig):
476 """ Return a string representation of a signature """
477 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
478
479 def _extract_signature_function(self, video_id, player_url, example_sig):
480 id_m = re.match(
481 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
482 player_url)
483 if not id_m:
484 raise ExtractorError('Cannot identify player %r' % player_url)
485 player_type = id_m.group('ext')
486 player_id = id_m.group('id')
487
488 # Read from filesystem cache
489 func_id = '%s_%s_%s' % (
490 player_type, player_id, self._signature_cache_id(example_sig))
491 assert os.path.basename(func_id) == func_id
492
493 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
494 if cache_spec is not None:
495 return lambda s: ''.join(s[i] for i in cache_spec)
496
497 if player_type == 'js':
498 code = self._download_webpage(
499 player_url, video_id,
500 note='Downloading %s player %s' % (player_type, player_id),
501 errnote='Download of %s failed' % player_url)
502 res = self._parse_sig_js(code)
503 elif player_type == 'swf':
504 urlh = self._request_webpage(
505 player_url, video_id,
506 note='Downloading %s player %s' % (player_type, player_id),
507 errnote='Download of %s failed' % player_url)
508 code = urlh.read()
509 res = self._parse_sig_swf(code)
510 else:
511 assert False, 'Invalid player type %r' % player_type
512
513 if cache_spec is None:
514 test_string = ''.join(map(compat_chr, range(len(example_sig))))
515 cache_res = res(test_string)
516 cache_spec = [ord(c) for c in cache_res]
517
518 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
519 return res
520
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function *func*.

        Used with --youtube-print-sig-code so the derived descrambling
        permutation can be pasted back into the code as a static rule.
        """
        def gen_sig_code(idxs):
            # Compress a list of source indices into Python slice/index
            # expressions, e.g. [3, 2, 1, 0] -> 's[3::-1]'.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be
            # set as soon as step is set
            start = '(Never used)'
            # Walk consecutive index pairs, accumulating runs with a
            # constant step of +1/-1 into slices; emit single indices
            # for isolated values.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the trailing element or the still-open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct characters to recover the permutation.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
559
560 def _parse_sig_js(self, jscode):
561 funcname = self._search_regex(
562 r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
563 'Initial JS player signature function name')
564
565 jsi = JSInterpreter(jscode)
566 initial_function = jsi.extract_function(funcname)
567 return lambda s: initial_function([s])
568
569 def _parse_sig_swf(self, file_contents):
570 swfi = SWFInterpreter(file_contents)
571 TARGET_CLASSNAME = 'SignatureDecipher'
572 searched_class = swfi.extract_class(TARGET_CLASSNAME)
573 initial_function = swfi.extract_function(searched_class, 'decipher')
574 return lambda s: initial_function([s])
575
576 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
577 """Turn the encrypted s field into a working signature"""
578
579 if player_url is None:
580 raise ExtractorError('Cannot decrypt signature without player_url')
581
582 if player_url.startswith('//'):
583 player_url = 'https:' + player_url
584 try:
585 player_id = (player_url, self._signature_cache_id(s))
586 if player_id not in self._player_cache:
587 func = self._extract_signature_function(
588 video_id, player_url, s
589 )
590 self._player_cache[player_id] = func
591 func = self._player_cache[player_id]
592 if self._downloader.params.get('youtube_print_sig_code'):
593 self._print_sig_code(func, s)
594 return func(s)
595 except Exception as e:
596 tb = traceback.format_exc()
597 raise ExtractorError(
598 'Signature extraction failed: ' + tb, cause=e)
599
600 def _get_available_subtitles(self, video_id, webpage):
601 try:
602 sub_list = self._download_webpage(
603 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
604 video_id, note=False)
605 except ExtractorError as err:
606 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
607 return {}
608 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
609
610 sub_lang_list = {}
611 for l in lang_list:
612 lang = l[1]
613 if lang in sub_lang_list:
614 continue
615 params = compat_urllib_parse.urlencode({
616 'lang': lang,
617 'v': video_id,
618 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
619 'name': unescapeHTML(l[0]).encode('utf-8'),
620 })
621 url = 'https://www.youtube.com/api/timedtext?' + params
622 sub_lang_list[lang] = url
623 if not sub_lang_list:
624 self._downloader.report_warning('video doesn\'t have subtitles')
625 return {}
626 return sub_lang_list
627
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language codes to auto-caption (ASR) URLs,
        or {} (with a warning) when none are available.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # NOTE(review): the dots in ';ytplayer.config' are unescaped, so they
        # match any character; harmless here but technically too loose.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            # KeyError from either lookup is handled by the except below.
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Auto captions exist only when the first <track> is an ASR track.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per <target> language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
674
675 @classmethod
676 def extract_id(cls, url):
677 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
678 if mobj is None:
679 raise ExtractorError('Invalid URL: %s' % url)
680 video_id = mobj.group(2)
681 return video_id
682
683 def _extract_from_m3u8(self, manifest_url, video_id):
684 url_map = {}
685
686 def _get_urls(_manifest):
687 lines = _manifest.split('\n')
688 urls = filter(lambda l: l and not l.startswith('#'),
689 lines)
690 return urls
691 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
692 formats_urls = _get_urls(manifest)
693 for format_url in formats_urls:
694 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
695 url_map[itag] = format_url
696 return url_map
697
698 def _extract_annotations(self, video_id):
699 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
700 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
701
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        """Download and parse a DASH MPD manifest into a list of format dicts."""
        # The manifest URL may carry an encrypted signature in a /s/<sig>
        # path segment; replace it with the decrypted /signature/<sig> form.
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        # Tags are namespace-qualified with the MPD 2011 schema URI.
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            # contentLength is a YouTube-specific attribute on BaseURL.
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                # bandwidth is in bit/s; tbr is expressed in kbit/s.
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            try:
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                # First sighting of this itag: enrich with the static
                # _formats table and append.
                f.update(self._formats.get(format_id, {}))
                formats.append(f)
            else:
                # Duplicate itag: merge manifest data into the existing entry.
                existing_format.update(f)
        return formats
741
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single YouTube video.

        Steps, in order: resolve ``next_url`` redirections (age verification),
        download the watch page, obtain the video info dict (directly from
        ``ytplayer.config`` or via the ``get_video_info`` endpoint for
        age-gated / embed-only videos), decrypt stream signatures where
        required, and optionally merge in formats from the DASH manifest.
        Returns a standard info dict; raises ExtractorError on fatal failures.
        """
        # Honour --prefer-insecure for every URL built in this method.
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age
        # verification, using the next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage (has_verified/bpctr parameters bypass some
        # interstitial verification pages)
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL inside swfConfig is JSON-escaped; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                # (each value wrapped in a one-element list)
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fall back to the get_video_info pages (used by the embed
                # page), trying several 'el' contexts until one yields a token
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            # A missing token means the video is unavailable; surface
            # YouTube's own reason when one is provided.
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Normalize separators to spaces before handing off to
            # unified_strdate.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect-wrapper links with their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Scrape a like/dislike counter from the watch page; returns an
            # int or None when the counter is absent.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn {itag: url} into a list of format dicts, enriched with the
            # static per-itag metadata from self._formats when known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    # Plain (unencrypted) signature: append as-is.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: needs the player code to decrypt.
                    encrypted_sig = url_data['s'][0]

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, 'JS player URL')
                        player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        # Describe which player (flash/html5, which version)
                        # is being used, for debugging signature problems.
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    # A broken DASH manifest should not kill the extraction;
                    # the regular formats above are still usable.
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    formats.extend(dash_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1052
1053
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist (including mixes)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # Fixed key typo: this was 'playlist_mincout', so the constraint
        # was silently never checked by the test runner.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        # Playlists may be private; log in when credentials are available.
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into YoutubeIE url results."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix.

        The mixes are generated from a single video;
        the id of the playlist is just 'RD' + video_id.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the most specific title container first.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                     href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract a playlist, dispatching mixes and video-specific URLs."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages, following the
        # "Load more" pagination until it is exhausted.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1234
1235
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract YouTube top lists via the 'yttoplist' pseudo-URL scheme."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Locate the anchor on the channel page whose label contains the
        # url-encoded list title, then follow it to the list page itself.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1277
1278
class YoutubeChannelIE(InfoExtractor):
    """Extract all uploaded videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the watch-page video ids found in *page*, deduplicated,
        in first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Detect auto-generated channels from their CSS class markers.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the "Load more" pagination, yielding one url
            # result per video id found on each page.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                ids_in_page = self.extract_videos_from_page(content_html)
                for video_id in ids_in_page:
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1340
1341
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a user through the (legacy) GData API."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum number of results per GData request
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator consumed lazily by OnDemandPagedList; yields one
            # url-type info dict per video on the requested page.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Empty page: past the end of the user's uploads.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1410
1411
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle 'ytsearchN:query' keywords via the legacy GData API."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            # GData start indices are 1-based.
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request beyond what the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1453
1454
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search as YoutubeSearchIE, but ordered by publish date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Identical to the parent API URL plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1460
1461
class YoutubeSearchURLIE(InfoExtractor):
    """Handle plain YouTube result-page URLs (youtube.com/results?...)."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # All result items live inside a single ordered list.
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(item_html):
            # The title is either in a title="..." attribute or, failing
            # that, in the anchor's text content.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            }

        entries = [
            build_entry(item_html)
            for item_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        ]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1503
1504
class YoutubeShowIE(InfoExtractor):
    """Extract every season playlist linked from a YouTube show page."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # Each season of the show is published as a separate playlist.
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(seasons)))
        entries = []
        for season in seasons:
            entries.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1539
1540
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Feed URL template; a '%s' placeholder remains for the paging value.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        # Page through the feed, collecting video ids, until the "load
        # more" widget no longer carries a paging token.
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet keeps first occurrence only, preserving feed order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1588
1589
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's recommended-videos feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1595
1596
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's Watch Later list."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # The list is account-specific, so the personal-feed action is required.
    _PERSONAL_FEED = True
1603
1604
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's YouTube watch history feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string added (was a plain string): '\.' must be a regex escape,
    # not depend on Python passing an unknown string escape through —
    # consistent with the sibling feed extractors and avoids the
    # invalid-escape warning on newer Python versions.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history feed is per-account, so the personal-feed action is used.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1611
1612
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page simply embeds a regular playlist; find its id
        # and delegate the actual extraction to YoutubePlaylistIE.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1623
1624
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos from the logged-in user's subscriptions feed."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # The parent playlist tests do not apply to this feed.
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            # Follow the "Load more" widget until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1661
1662
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose v= parameter was cut off (usually by the shell
    treating '&' as a meta character) and fail with a helpful message."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Extraction is impossible without the video id, so always raise an
        # error explaining the likely quoting mistake to the user.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)