# coding: utf-8

from __future__ import unicode_literals


import itertools
import json
import os.path
import re
import time
import traceback

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..compat import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    get_element_by_attribute,
    get_element_by_id,
    int_or_none,
    OnDemandPagedList,
    orderedSet,
    unescapeHTML,
    unified_strdate,
    uppercase_escape,
)


class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # Age-gate video with encrypted signature
        {
            'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
            'info_dict': {
                'id': '6kLq3WMV1nU',
                'ext': 'mp4',
                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                'uploader': 'LloydVEVO',
                'uploader_id': 'LloydVEVO',
                'upload_date': '20110629',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',
                'upload_date': '20120731',
                'uploader_id': 'olympic',
                'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                'uploader': 'Olympics',
                'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
            },
            'params': {
                'skip_download': 'requires avconv',
            }
        }
    ]

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')

    def _signature_cache_id(self, example_sig):
        """ Return a string representation of a signature """
        return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
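    # Illustrative example (not from the original source): a signature such as
    # 'ABCDEFGH.IJK' splits into parts of lengths 8 and 3, so the cache id is
    # '8.3'. Signatures with the same length pattern reuse the same cipher spec.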

    def _extract_signature_function(self, video_id, player_url, example_sig):
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res

    def _print_sig_code(self, func, example_sig):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Squelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
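    # Illustrative example (not from the original source): for a cache_spec of
    # [5, 4, 3, 2, 1, 0], gen_sig_code collapses the run into the single slice
    # 's[5::-1]', i.e. the first six characters of the test string reversed.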

    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
            'Initial JS player signature function name')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])
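    # Illustrative example (not from the original source): given player code
    # containing 'a.sig||xy(a.s)', the regex above captures 'xy' as the name
    # of the signature-deciphering function.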

    def _parse_sig_swf(self, file_contents):
        swfi = SWFInterpreter(file_contents)
        TARGET_CLASSNAME = 'SignatureDecipher'
        searched_class = swfi.extract_class(TARGET_CLASSNAME)
        initial_function = swfi.extract_function(searched_class, 'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is None:
            raise ExtractorError('Cannot decrypt signature without player_url')

        if player_url.startswith('//'):
            player_url = 'https:' + player_url
        try:
            player_id = (player_url, self._signature_cache_id(s))
            if player_id not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, s
                )
                self._player_cache[player_id] = func
            func = self._player_cache[player_id]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, s)
            return func(s)
        except Exception as e:
            tb = traceback.format_exc()
            raise ExtractorError(
                'Signature extraction failed: ' + tb, cause=e)

    def _get_available_subtitles(self, video_id, webpage):
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            if lang in sub_lang_list:
                continue
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': track.attrib['name'].encode('utf-8'),
            })
            url = 'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    @classmethod
    def extract_id(cls, url):
        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id
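    # Example usage (video id taken from the test cases above):
    #   YoutubeIE.extract_id('https://youtu.be/BaW_jenozKc') == 'BaW_jenozKc'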

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}

        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map
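    # Illustrative example (not from the original source): a manifest line like
    # 'https://host/path/itag/92/index.m3u8' yields
    # url_map['92'] = 'https://host/path/itag/92/index.m3u8'.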

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
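        # Illustrative example (not from the original source): a manifest URL
        # containing '/s/ABC123.DEF456/' is rewritten to
        # '/signature/<deciphered value>/' before the manifest is downloaded.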
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'height': int_or_none(r.attrib.get('height')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            try:
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                f.update(self._formats.get(format_id, {}).items())
                formats.append(f)
            else:
                existing_format.update(f)
        return formats

    def _real_extract(self, url):
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
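        # (illustrative: '.../verify_age?next_url=%2Fwatch%3Fv%3DXXXXXXXXXXX'
        # is unquoted and rebuilt as '.../watch?v=XXXXXXXXXXX')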
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without logging in to Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fall back to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
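            # Each comma-separated entry is itself a URL-encoded query string,
            # e.g. (illustrative) 'itag=22&url=https%3A%2F%2F...&sig=...',
            # which compat_parse_qs unpacks below.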
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]

                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        embed_webpage if age_gate else video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    # Hide the formats we found through non-DASH
                    dash_keys = set(df['format_id'] for df in dash_formats)
                    for f in formats:
                        if f['format_id'] in dash_keys:
                            f['format_id'] = 'nondash-%s' % f['format_id']
                            f['preference'] = f.get('preference', 0) - 10000
                    formats.extend(dash_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }


class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 also appears twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
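        # (illustrative: a mix id is 'RD' followed by an 11-character video id,
        # so playlist_id[-11:] recovers the seed video id)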
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)


class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                ids_in_page = self.extract_videos_from_page(content_html)
                for video_id in ids_in_page:
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)


class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
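            # Page 0 requests entries 1-50, page 1 entries 51-100, and so on,
            # given the default _GDATA_PAGE_SIZE of 50.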

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)


class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)


class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'


class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        part_codes = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = []
        for part_code in part_codes:
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entries.append({
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }


1482class YoutubeShowIE(InfoExtractor):
1483 IE_DESC = 'YouTube.com (multi-season) shows'
1484 _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
1485 IE_NAME = 'youtube:show'
1486 _TESTS = [{
1487 'url': 'http://www.youtube.com/show/airdisasters',
1488 'playlist_mincount': 3,
1489 'info_dict': {
1490 'id': 'airdisasters',
1491 'title': 'Air Disasters',
1492 }
1493 }]
1494
1495 def _real_extract(self, url):
1496 mobj = re.match(self._VALID_URL, url)
1497 playlist_id = mobj.group('id')
1498 webpage = self._download_webpage(
1499 url, playlist_id, 'Downloading show webpage')
1500 # There's one playlist for each season of the show
1501 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
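# Season links are assumed to look like (illustrative value):
#   <a href="/playlist?list=PL0123456789ABCDEF">Season 1</a>
# i.e. one playlist link per season of the show.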
1502 self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
1503 entries = [
1504 self.url_result(
1505 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
1506 for season in m_seasons
1507 ]
1508 title = self._og_search_title(webpage, fatal=False)
1509
1510 return {
1511 '_type': 'playlist',
1512 'id': playlist_id,
1513 'title': title,
1514 'entries': entries,
1515 }
1516
1517
1518class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1519 """
1520 Base class for extractors that fetch info from
1521 http://www.youtube.com/feed_ajax
1522 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1523 """
1524 _LOGIN_REQUIRED = True
1525 # If True, request the feed with action_load_personal_feed instead of action_load_system_feed
1526 _PERSONAL_FEED = False
1527
1528 @property
1529 def _FEED_TEMPLATE(self):
1530 action = 'action_load_system_feed'
1531 if self._PERSONAL_FEED:
1532 action = 'action_load_personal_feed'
1533 return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
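# For example (illustrative), with _FEED_NAME = 'recommended' the system-feed
# template expands to:
#   https://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=recommended&paging=%s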
1534
1535 @property
1536 def IE_NAME(self):
1537 return 'youtube:%s' % self._FEED_NAME
1538
1539 def _real_initialize(self):
1540 self._login()
1541
1542 def _real_extract(self, url):
1543 feed_entries = []
1544 paging = 0
1545 for i in itertools.count(1):
1546 info = self._download_json(
1547 self._FEED_TEMPLATE % paging,
1548 '%s feed' % self._FEED_NAME,
1549 'Downloading page %s' % i,
1550 transform_source=uppercase_escape)
1551 feed_html = info.get('feed_html') or info.get('content_html')
1552 load_more_widget_html = info.get('load_more_widget_html') or feed_html
1553 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1554 ids = orderedSet(m.group(1) for m in m_ids)
1555 feed_entries.extend(
1556 self.url_result(video_id, 'Youtube', video_id=video_id)
1557 for video_id in ids)
1558 mobj = re.search(
1559 r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
1560 load_more_widget_html)
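# The "load more" button is assumed to carry an attribute such as
# (hypothetical value): data-uix-load-more-href="/feed_ajax?...&paging=20"
# Its absence means the feed has been exhausted and paging stops.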
1561 if mobj is None:
1562 break
1563 paging = mobj.group('paging')
1564 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1565
1566
1567class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1568 IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1569 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1570 _FEED_NAME = 'recommended'
1571 _PLAYLIST_TITLE = 'Youtube Recommended videos'
1572
1573
1574class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1575 IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1576 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1577 _FEED_NAME = 'watch_later'
1578 _PLAYLIST_TITLE = 'Youtube Watch Later'
1579 _PERSONAL_FEED = True
1580
1581
1582class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1583 IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
1584 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1585 _FEED_NAME = 'history'
1586 _PERSONAL_FEED = True
1587 _PLAYLIST_TITLE = 'Youtube Watch History'
1588
1589
1590class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1591 IE_NAME = 'youtube:favorites'
1592 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
1593 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
1594 _LOGIN_REQUIRED = True
1595
1596 def _real_extract(self, url):
1597 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1598 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
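# The favourites page is assumed to embed the playlist ID in a link such as
# (illustrative): href="/playlist?list=FL0123456789A&..." which the regex
# above captures up to the closing quote or the next parameter.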
1599 return self.url_result(playlist_id, 'YoutubePlaylist')
1600
1601
1602class YoutubeSubscriptionsIE(YoutubePlaylistIE):
1603 IE_NAME = 'youtube:subscriptions'
1604 IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1605 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1606 _TESTS = []
1607
1608 def _real_extract(self, url):
1609 title = 'Youtube Subscriptions'
1610 page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
1611
1612 # The extraction process is the same as for playlists, but the regex
1613 # for the video IDs doesn't contain an index
1614 ids = []
1615 more_widget_html = content_html = page
1616
1617 for page_num in itertools.count(1):
1618 matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
1619 new_ids = orderedSet(matches)
1620 ids.extend(new_ids)
1621
1622 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1623 if not mobj:
1624 break
1625
1626 more = self._download_json(
1627 'https://youtube.com/%s' % mobj.group('more'), title,
1628 'Downloading page #%s' % page_num,
1629 transform_source=uppercase_escape)
1630 content_html = more['content_html']
1631 more_widget_html = more['load_more_widget_html']
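# Hedged sketch of the continuation JSON consumed above (abridged):
#   {'content_html': '... <a href="/watch?v=BaW_jenozKc"> ...',
#    'load_more_widget_html': '... data-uix-load-more-href="..." ...'}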
1632
1633 return {
1634 '_type': 'playlist',
1635 'title': title,
1636 'entries': self._ids_to_results(ids),
1637 }
1638
1639
1640class YoutubeTruncatedURLIE(InfoExtractor):
1641 IE_NAME = 'youtube:truncated_url'
1642 IE_DESC = False # Do not list
1643 _VALID_URL = r'''(?x)
1644 (?:https?://)?[^/]+/watch\?(?:
1645 feature=[a-z_]+|
1646 annotation_id=annotation_[^&]+
1647 )?$|
1648 (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
1649 '''
1650
1651 _TESTS = [{
1652 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
1653 'only_matching': True,
1654 }, {
1655 'url': 'http://www.youtube.com/watch?',
1656 'only_matching': True,
1657 }]
1658
1659 def _real_extract(self, url):
1660 raise ExtractorError(
1661 'Did you forget to quote the URL? Remember that & is a meta '
1662 'character in most shells, so you want to put the URL in quotes, '
1663 'like youtube-dl '
1664 '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1665 ' or simply youtube-dl BaW_jenozKc .',
1666 expected=True)
1667
1668
1669class YoutubeTruncatedIDIE(InfoExtractor):
1670 IE_NAME = 'youtube:truncated_id'
1671 IE_DESC = False # Do not list
1672 _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
1673
1674 _TESTS = [{
1675 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
1676 'only_matching': True,
1677 }]
1678
1679 def _real_extract(self, url):
1680 video_id = self._match_id(url)
1681 raise ExtractorError(
1682 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
1683 expected=True)