# coding: utf-8

from __future__ import unicode_literals


import itertools
import json
import os.path
import re
import time
import traceback

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    OnDemandPagedList,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    uppercase_escape,
)


class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                       # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                         # handle anchor (#/) redirect urls
                         (?:                                                 # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))               # v/ or embed/ or e/
                             |(?:                                            # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                 # the params delimiter ? or # or #!
                                 (?:.*?&)?                                   # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                         # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                      # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                     # here it is! the YouTube video ID
                     (?!.*?&list=)                                           # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                               # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # DASH mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # DASH webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},

        # DASH webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # DASH webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 'md5:eca57043abae25130f58f655ad9a7771',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        }
    ]

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')

    def _signature_cache_id(self, example_sig):
        """ Return a string representation of a signature """
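        # e.g. a hypothetical scrambled signature 'abcdef.ghi.jk' gives '6.3.2'
        # (the lengths of the dot-separated parts)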
        return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))

    def _extract_signature_function(self, video_id, player_url, example_sig):
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
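        # A cached spec is a list of character indices: picking s[i] for each
        # i in the spec rebuilds the deciphered signature without having to
        # re-interpret the player code on every run.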
        if cache_spec is not None:
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res

    def _print_sig_code(self, func, example_sig):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
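                # Render one contiguous index run as a Python slice, e.g.
                # start=3, end=1, step=-1 produces 's[3:0:-1]'.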
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)

    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
            'Initial JS player signature function name')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])

    def _parse_sig_swf(self, file_contents):
        swfi = SWFInterpreter(file_contents)
        TARGET_CLASSNAME = 'SignatureDecipher'
        searched_class = swfi.extract_class(TARGET_CLASSNAME)
        initial_function = swfi.extract_function(searched_class, 'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is None:
            raise ExtractorError('Cannot decrypt signature without player_url')

        if player_url.startswith('//'):
            player_url = 'https:' + player_url
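        # The extracted function is cached per (player URL, signature length
        # pattern), so repeated downloads with the same player skip the
        # expensive extraction step.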
        try:
            player_id = (player_url, self._signature_cache_id(s))
            if player_id not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, s
                )
                self._player_cache[player_id] = func
            func = self._player_cache[player_id]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, s)
            return func(s)
        except Exception as e:
            tb = traceback.format_exc()
            raise ExtractorError(
                'Signature extraction failed: ' + tb, cause=e)

    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            if lang in sub_lang_list:
                continue
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = 'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    @classmethod
    def extract_id(cls, url):
        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}

        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
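        # Each variant URL embeds its itag as a path segment
        # ('.../itag/<n>/...'), which maps it back to the _formats table.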
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
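        # The manifest URL may carry an encrypted signature in an '/s/<sig>'
        # path segment; rewrite it into the deciphered '/signature/<sig>' form.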
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                # MPD bandwidth is given in bit/s; scale it down to the
                # kbit/s expected in 'tbr'
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            try:
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                f.update(self._formats.get(format_id, {}))
                formats.append(f)
            else:
                existing_format.update(f)
        return formats

    def _real_extract(self, url):
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # which can be viewed without logging in to YouTube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fall back to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', fatal=False)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
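            # encoded_url_map is now a comma-separated concatenation of
            # URL-encoded query strings, one entry per available format.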
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, 'JS player URL')
                        player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if not dash_mpd:
                self.report_warning('%s: DASH manifest missing' % video_id)
            else:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    formats.extend(dash_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }


class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page
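        # Page 1 is the playlist HTML itself; later pages arrive as JSON
        # blobs whose 'content_html' and 'load_more_widget_html' fields feed
        # the next iteration of the pagination loop below.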

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)


class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)


class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]
            return self.playlist_result(entries, channel_id)

        def _entries():
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                for video_id in ids_in_page:
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        return self.playlist_result(_entries(), channel_id)


class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with the other youtube
        # extractors: this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
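        # OnDemandPagedList calls download_page(pagenum) lazily, only for the
        # pages that are actually requested (e.g. with --playlist-end).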
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)


class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)


class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'


class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        part_codes = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = []
        for part_code in part_codes:
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entries.append({
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }


class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
        entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons
        ]
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }


class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
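            # The next 'paging' value is advertised inside the load-more
            # widget; once it disappears we have reached the last page.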
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)


class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'


class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True


class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'


class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')


class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }


class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)