# jfr.im git - yt-dlp.git - youtube_dl/extractor/youtube.py
# [youtube] Add test case for #4431
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from .subtitles import SubtitlesInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24 )
25 from ..utils import (
26 clean_html,
27 ExtractorError,
28 get_element_by_attribute,
29 get_element_by_id,
30 int_or_none,
31 OnDemandPagedList,
32 orderedSet,
33 unescapeHTML,
34 unified_strdate,
35 uppercase_escape,
36 )
37
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the English interface (hl=en) and a large result count via
        # the PREF cookie so that scraping regexes see a predictable layout.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Bail out here: previously the code fell through and called
                # match.group(1) on None, raising AttributeError.
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Same fix as for secTok above.
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            # Same UTF-8-before-urlencode dance as for the login form above.
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # No downloader means nothing to configure and no way to log in.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
185
186
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Verbose regex: hostname variants, optional path prefixes, then the
    # 11-character video ID.  Group 1 is the URL prefix (may be absent for a
    # naked ID); group 2 is the video ID used by extract_id().
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Matches the next_url= redirect parameter (e.g. age-verification pages).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format metadata.  Negative 'preference' values demote a
    # class of formats (3D, HLS, DASH) below the default muxed formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    # Test cases consumed by the test framework; each pins expected metadata
    # for one URL shape or extraction code path.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 'md5:eca57043abae25130f58f655ad9a7771',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',

            }
        }
    ]
447
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of signature-deciphering functions, keyed by
        # (player_url, signature length spec); see _decrypt_signature().
        self._player_cache = {}
451
452 def report_video_info_webpage_download(self, video_id):
453 """Report attempt to download video info webpage."""
454 self.to_screen('%s: Downloading video info webpage' % video_id)
455
456 def report_information_extraction(self, video_id):
457 """Report attempt to extract video information."""
458 self.to_screen('%s: Extracting video information' % video_id)
459
460 def report_unavailable_format(self, video_id, format):
461 """Report extracted video URL."""
462 self.to_screen('%s: Format %s not available' % (video_id, format))
463
464 def report_rtmp_download(self):
465 """Indicate the download will use the RTMP protocol."""
466 self.to_screen('RTMP download detected')
467
468 def _signature_cache_id(self, example_sig):
469 """ Return a string representation of a signature """
470 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
471
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """
        Return a function that deciphers encrypted signatures for the player
        at player_url, loading it from the on-disk cache when possible.

        The cached value is a list of character indices: applying it to a
        signature means picking the characters at those positions.
        """
        # The player file name encodes a player ID and its type (js or swf).
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # Guard against path-traversal characters sneaking into the cache key.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            return lambda s: ''.join(s[i] for i in cache_spec)

        # Cache miss: download the player and extract the function from it.
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            # Derive the index spec by running the function on a test string
            # whose characters encode their own positions.
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
513
    def _print_sig_code(self, func, example_sig):
        """
        Print Python source equivalent to the deciphering function *func*,
        expressed as slices/indexing of the input signature s (used with
        the --youtube-print-sig-code option).
        """
        def gen_sig_code(idxs):
            # Compress consecutive index runs (step +1/-1) into slices.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be
            # set as soon as step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: either extend it or close it out.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever the loop left open: a lone index or an open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Run func on a position-encoding test string to recover the index
        # permutation it applies (same trick as _extract_signature_function).
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
552
553 def _parse_sig_js(self, jscode):
554 funcname = self._search_regex(
555 r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
556 'Initial JS player signature function name')
557
558 jsi = JSInterpreter(jscode)
559 initial_function = jsi.extract_function(funcname)
560 return lambda s: initial_function([s])
561
562 def _parse_sig_swf(self, file_contents):
563 swfi = SWFInterpreter(file_contents)
564 TARGET_CLASSNAME = 'SignatureDecipher'
565 searched_class = swfi.extract_class(TARGET_CLASSNAME)
566 initial_function = swfi.extract_function(searched_class, 'decipher')
567 return lambda s: initial_function([s])
568
569 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
570 """Turn the encrypted s field into a working signature"""
571
572 if player_url is None:
573 raise ExtractorError('Cannot decrypt signature without player_url')
574
575 if player_url.startswith('//'):
576 player_url = 'https:' + player_url
577 try:
578 player_id = (player_url, self._signature_cache_id(s))
579 if player_id not in self._player_cache:
580 func = self._extract_signature_function(
581 video_id, player_url, s
582 )
583 self._player_cache[player_id] = func
584 func = self._player_cache[player_id]
585 if self._downloader.params.get('youtube_print_sig_code'):
586 self._print_sig_code(func, s)
587 return func(s)
588 except Exception as e:
589 tb = traceback.format_exc()
590 raise ExtractorError(
591 'Signature extraction failed: ' + tb, cause=e)
592
593 def _get_available_subtitles(self, video_id, webpage):
594 try:
595 sub_list = self._download_webpage(
596 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
597 video_id, note=False)
598 except ExtractorError as err:
599 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
600 return {}
601 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
602
603 sub_lang_list = {}
604 for l in lang_list:
605 lang = l[1]
606 if lang in sub_lang_list:
607 continue
608 params = compat_urllib_parse.urlencode({
609 'lang': lang,
610 'v': video_id,
611 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
612 'name': unescapeHTML(l[0]).encode('utf-8'),
613 })
614 url = 'https://www.youtube.com/api/timedtext?' + params
615 sub_lang_list[lang] = url
616 if not sub_lang_list:
617 self._downloader.report_warning('video doesn\'t have subtitles')
618 return {}
619 return sub_lang_list
620
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL for the
        automatic (ASR) captions, or {} when none are available."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The original track must be an ASR (auto-recognized) one,
            # otherwise these are regular subtitles, not automatic captions.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles; KeyError covers any
        # missing field in ytplayer.config.
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
667
668 @classmethod
669 def extract_id(cls, url):
670 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
671 if mobj is None:
672 raise ExtractorError('Invalid URL: %s' % url)
673 video_id = mobj.group(2)
674 return video_id
675
676 def _extract_from_m3u8(self, manifest_url, video_id):
677 url_map = {}
678
679 def _get_urls(_manifest):
680 lines = _manifest.split('\n')
681 urls = filter(lambda l: l and not l.startswith('#'),
682 lines)
683 return urls
684 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
685 formats_urls = _get_urls(manifest)
686 for format_url in formats_urls:
687 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
688 url_map[itag] = format_url
689 return url_map
690
691 def _extract_annotations(self, video_id):
692 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
693 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
694
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        """
        Download a DASH MPD manifest and return a list of format dicts.

        Encrypted '/s/<sig>' path components in the manifest URL are
        deciphered in place before the download.
        """
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        # Representation elements live in the MPD 2011 XML namespace.
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            # Content length is exposed via a YouTube-specific attribute.
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            try:
                # Merge with an already-collected format of the same id,
                # otherwise enrich from the static _formats table and append.
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                f.update(self._formats.get(format_id, {}))
                formats.append(f)
            else:
                existing_format.update(f)
        return formats
734
735 def _real_extract(self, url):
736 proto = (
737 'http' if self._downloader.params.get('prefer_insecure', False)
738 else 'https')
739
740 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
741 mobj = re.search(self._NEXT_URL_RE, url)
742 if mobj:
743 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
744 video_id = self.extract_id(url)
745
746 # Get video webpage
747 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
748 video_webpage = self._download_webpage(url, video_id)
749
750 # Attempt to extract SWF player URL
751 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
752 if mobj is not None:
753 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
754 else:
755 player_url = None
756
757 # Get video info
758 if re.search(r'player-age-gate-content">', video_webpage) is not None:
759 age_gate = True
760 # We simulate the access to the video from www.youtube.com/v/{video_id}
761 # this can be viewed without login into Youtube
762 data = compat_urllib_parse.urlencode({
763 'video_id': video_id,
764 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
765 'sts': self._search_regex(
766 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
767 })
768 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
769 video_info_webpage = self._download_webpage(
770 video_info_url, video_id,
771 note='Refetching age-gated info webpage',
772 errnote='unable to download video info webpage')
773 video_info = compat_parse_qs(video_info_webpage)
774 else:
775 age_gate = False
776 try:
777 # Try looking directly into the video webpage
778 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
779 if not mobj:
780 raise ValueError('Could not find ytplayer.config') # caught below
781 json_code = uppercase_escape(mobj.group(1))
782 ytplayer_config = json.loads(json_code)
783 args = ytplayer_config['args']
784 # Convert to the same format returned by compat_parse_qs
785 video_info = dict((k, [v]) for k, v in args.items())
786 if 'url_encoded_fmt_stream_map' not in args:
787 raise ValueError('No stream_map present') # caught below
788 except ValueError:
789 # We fallback to the get_video_info pages (used by the embed page)
790 self.report_video_info_webpage_download(video_id)
791 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
792 video_info_url = (
793 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
794 % (proto, video_id, el_type))
795 video_info_webpage = self._download_webpage(
796 video_info_url,
797 video_id, note=False,
798 errnote='unable to download video info webpage')
799 video_info = compat_parse_qs(video_info_webpage)
800 if 'token' in video_info:
801 break
802 if 'token' not in video_info:
803 if 'reason' in video_info:
804 raise ExtractorError(
805 'YouTube said: %s' % video_info['reason'][0],
806 expected=True, video_id=video_id)
807 else:
808 raise ExtractorError(
809 '"token" parameter not in video info for unknown reason',
810 video_id=video_id)
811
812 if 'view_count' in video_info:
813 view_count = int(video_info['view_count'][0])
814 else:
815 view_count = None
816
817 # Check for "rental" videos
818 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
819 raise ExtractorError('"rental" videos not supported')
820
821 # Start extracting information
822 self.report_information_extraction(video_id)
823
824 # uploader
825 if 'author' not in video_info:
826 raise ExtractorError('Unable to extract uploader name')
827 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
828
829 # uploader_id
830 video_uploader_id = None
831 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
832 if mobj is not None:
833 video_uploader_id = mobj.group(1)
834 else:
835 self._downloader.report_warning('unable to extract uploader nickname')
836
837 # title
838 if 'title' in video_info:
839 video_title = video_info['title'][0]
840 else:
841 self._downloader.report_warning('Unable to extract video title')
842 video_title = '_'
843
844 # thumbnail image
845 # We try first to get a high quality image:
846 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
847 video_webpage, re.DOTALL)
848 if m_thumb is not None:
849 video_thumbnail = m_thumb.group(1)
850 elif 'thumbnail_url' not in video_info:
851 self._downloader.report_warning('unable to extract video thumbnail')
852 video_thumbnail = None
853 else: # don't panic if we can't find it
854 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
855
856 # upload date
857 upload_date = None
858 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
859 if mobj is None:
860 mobj = re.search(
861 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
862 video_webpage)
863 if mobj is not None:
864 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
865 upload_date = unified_strdate(upload_date)
866
867 m_cat_container = self._search_regex(
868 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
869 video_webpage, 'categories', fatal=False)
870 if m_cat_container:
871 category = self._html_search_regex(
872 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
873 default=None)
874 video_categories = None if category is None else [category]
875 else:
876 video_categories = None
877
878 # description
879 video_description = get_element_by_id("eow-description", video_webpage)
880 if video_description:
881 video_description = re.sub(r'''(?x)
882 <a\s+
883 (?:[a-zA-Z-]+="[^"]+"\s+)*?
884 title="([^"]+)"\s+
885 (?:[a-zA-Z-]+="[^"]+"\s+)*?
886 class="yt-uix-redirect-link"\s*>
887 [^<]+
888 </a>
889 ''', r'\1', video_description)
890 video_description = clean_html(video_description)
891 else:
892 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
893 if fd_mobj:
894 video_description = unescapeHTML(fd_mobj.group(1))
895 else:
896 video_description = ''
897
898 def _extract_count(count_name):
899 count = self._search_regex(
900 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
901 video_webpage, count_name, default=None)
902 if count is not None:
903 return int(count.replace(',', ''))
904 return None
905 like_count = _extract_count('like')
906 dislike_count = _extract_count('dislike')
907
908 # subtitles
909 video_subtitles = self.extract_subtitles(video_id, video_webpage)
910
911 if self._downloader.params.get('listsubtitles', False):
912 self._list_available_subtitles(video_id, video_webpage)
913 return
914
915 if 'length_seconds' not in video_info:
916 self._downloader.report_warning('unable to extract video duration')
917 video_duration = None
918 else:
919 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
920
921 # annotations
922 video_annotations = None
923 if self._downloader.params.get('writeannotations', False):
924 video_annotations = self._extract_annotations(video_id)
925
926 def _map_to_format_list(urlmap):
927 formats = []
928 for itag, video_real_url in urlmap.items():
929 dct = {
930 'format_id': itag,
931 'url': video_real_url,
932 'player_url': player_url,
933 }
934 if itag in self._formats:
935 dct.update(self._formats[itag])
936 formats.append(dct)
937 return formats
938
939 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
940 self.report_rtmp_download()
941 formats = [{
942 'format_id': '_rtmp',
943 'protocol': 'rtmp',
944 'url': video_info['conn'][0],
945 'player_url': player_url,
946 }]
947 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
948 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
949 if 'rtmpe%3Dyes' in encoded_url_map:
950 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
951 url_map = {}
952 for url_data_str in encoded_url_map.split(','):
953 url_data = compat_parse_qs(url_data_str)
954 if 'itag' not in url_data or 'url' not in url_data:
955 continue
956 format_id = url_data['itag'][0]
957 url = url_data['url'][0]
958
959 if 'sig' in url_data:
960 url += '&signature=' + url_data['sig'][0]
961 elif 's' in url_data:
962 encrypted_sig = url_data['s'][0]
963
964 if not age_gate:
965 jsplayer_url_json = self._search_regex(
966 r'"assets":.+?"js":\s*("[^"]+")',
967 video_webpage, 'JS player URL')
968 player_url = json.loads(jsplayer_url_json)
969 if player_url is None:
970 player_url_json = self._search_regex(
971 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
972 video_webpage, 'age gate player URL')
973 player_url = json.loads(player_url_json)
974
975 if self._downloader.params.get('verbose'):
976 if player_url is None:
977 player_version = 'unknown'
978 player_desc = 'unknown'
979 else:
980 if player_url.endswith('swf'):
981 player_version = self._search_regex(
982 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
983 'flash player', fatal=False)
984 player_desc = 'flash player %s' % player_version
985 else:
986 player_version = self._search_regex(
987 r'html5player-([^/]+?)(?:/html5player)?\.js',
988 player_url,
989 'html5 player', fatal=False)
990 player_desc = 'html5 player %s' % player_version
991
992 parts_sizes = self._signature_cache_id(encrypted_sig)
993 self.to_screen('{%s} signature length %s, %s' %
994 (format_id, parts_sizes, player_desc))
995
996 signature = self._decrypt_signature(
997 encrypted_sig, video_id, player_url, age_gate)
998 url += '&signature=' + signature
999 if 'ratebypass' not in url:
1000 url += '&ratebypass=yes'
1001 url_map[format_id] = url
1002 formats = _map_to_format_list(url_map)
1003 elif video_info.get('hlsvp'):
1004 manifest_url = video_info['hlsvp'][0]
1005 url_map = self._extract_from_m3u8(manifest_url, video_id)
1006 formats = _map_to_format_list(url_map)
1007 else:
1008 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1009
1010 # Look for the DASH manifest
1011 if self._downloader.params.get('youtube_include_dash_manifest', True):
1012 dash_mpd = video_info.get('dashmpd')
1013 if not dash_mpd:
1014 self.report_warning('%s: DASH manifest missing' % video_id)
1015 else:
1016 dash_manifest_url = dash_mpd[0]
1017 try:
1018 dash_formats = self._parse_dash_manifest(
1019 video_id, dash_manifest_url, player_url, age_gate)
1020 except (ExtractorError, KeyError) as e:
1021 self.report_warning(
1022 'Skipping DASH manifest: %r' % e, video_id)
1023 else:
1024 formats.extend(dash_formats)
1025
1026 self._sort_formats(formats)
1027
1028 return {
1029 'id': video_id,
1030 'uploader': video_uploader,
1031 'uploader_id': video_uploader_id,
1032 'upload_date': upload_date,
1033 'title': video_title,
1034 'thumbnail': video_thumbnail,
1035 'description': video_description,
1036 'categories': video_categories,
1037 'subtitles': video_subtitles,
1038 'duration': video_duration,
1039 'age_limit': 18 if age_gate else 0,
1040 'annotations': video_annotations,
1041 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1042 'view_count': view_count,
1043 'like_count': like_count,
1044 'dislike_count': dislike_count,
1045 'formats': formats,
1046 }
1047
1048
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist (including mixes) as a
    playlist result of individual 'Youtube' url results."""
    IE_DESC = 'YouTube.com playlists'
    # Two alternatives: a full playlist/watch URL (first branch, id captured
    # in group 1) or a bare playlist id (second branch, group 2).
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  p/
                     )
                     (
                         (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                  |
                     ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Presence of this marker in the load-more widget means more pages follow.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        # Logging in (when credentials were supplied) gives access to
        # private playlists.
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id into a url_result delegating to the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix; these have no regular playlist page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title container differs between page layouts; try the most
        # specific class first.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                     href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        # group(1): id from a full URL; group(2): bare playlist id.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL (a watch URL that also carries
        # a list= parameter).
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages, following the
        # load-more widget link until it disappears.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1219
1220
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a 'yttoplist:<channel>:<list title>' pseudo-URL into the
    matching top-list playlist on the channel page."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # The channel page links the list by its URL-encoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(encoded_title),
            channel_page, 'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        attempt = 0
        while not ids:
            note = 'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(list_url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1262
1263
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel as a playlist result."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*, in first-seen order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        channel_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(channel_url, channel_id)
        # Autogenerated channels keep everything on one page; their ajax
        # pagination endpoints return empty results.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            return self.playlist_result(
                [self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _entries():
            # Lazily walk the ajax pages until the load-more marker vanishes.
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                for video_id in self.extract_videos_from_page(page['content_html']):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        return self.playlist_result(_entries(), channel_id)
1320
1321
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploads via the GData API, page by page."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # The regex above is very permissive, so defer to any other youtube
        # extractor that claims this URL first.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # The GData API caps each response (currently at 50 entries), so the
        # uploads feed is fetched page by page until one comes back empty.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # Past the last page: yield nothing.
                return

            # Extract video identifiers; the GData id field is a URL whose
            # last path component is the video id.
            for entry in feed['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }

        return self.playlist_result(
            OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE),
            playlist_title=username)
1390
1391
class YoutubeSearchIE(SearchInfoExtractor):
    """'ytsearch' keyword: query the GData search API for up to n results."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50

        video_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; don't request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have overshot the requested count.
        del video_ids[n:]
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(videos, query)
1433
1434
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search backend as YoutubeSearchIE; the API URL only adds
    # orderby=published so the newest uploads come first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1440
1441
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the entries shown on a youtube.com/results search page."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is wrapped in a lockup-title heading.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1483
1484
class YoutubeShowIE(InfoExtractor):
    """Extract a multi-season show as a playlist of per-season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(seasons)))
        entries = []
        for season in seasons:
            entries.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1519
1520
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one '%s' left open for the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        # Derived automatically from the feed name, e.g. 'youtube:recommended'.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-bound; _LOGIN_REQUIRED above makes _login raise
        # when no credentials were provided.
        self._login()

    def _real_extract(self, url):
        """Walk the ajax feed pages and collect every linked video."""
        feed_entries = []
        # NOTE: 'paging' starts as the int 0 and becomes the string captured
        # from the load-more link on later iterations; both format via %s.
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            # Some feeds deliver 'feed_html', others 'content_html'.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                # No load-more link left: last page reached.
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1566
1567
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Thin configuration subclass: all logic lives in YoutubeFeedsInfoExtractor.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1573
1574
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later is tied to the account, hence the personal feed action.
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True
1581
1582
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Thin configuration subclass: all logic lives in YoutubeFeedsInfoExtractor.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Fix: made this a raw string like every sibling extractor's _VALID_URL;
    # the plain literal relied on the invalid escape '\.' passing through,
    # which raises a DeprecationWarning on modern Python. The resulting
    # pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history feed is per-account, so the personal feed_ajax action is used.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1589
1590
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to their backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds an ordinary playlist id; extract it and
        # hand the actual work over to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1601
1602
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the logged-in user's subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Paginates just like the playlist extractor, except the video links
        # in this feed carry no index attribute.
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            next_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if next_mobj is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % next_mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1639
1640
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose video id got cut off (typically an unquoted '&'
    in the shell) and fail with a helpful message instead of a cryptic one."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: there is no video id left to extract.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)