]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[dailymotion] Convert to new subtitles system
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from .subtitles import SubtitlesInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24 )
25 from ..utils import (
26 clean_html,
27 ExtractorError,
28 float_or_none,
29 get_element_by_attribute,
30 get_element_by_id,
31 int_or_none,
32 OnDemandPagedList,
33 orderedSet,
34 unescapeHTML,
35 unified_strdate,
36 uppercase_escape,
37 )
38
39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the English interface (hl=en) and maximum result counts via
        # the PREF cookie so that page-scraping regexes stay stable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # BUGFIX: previously returned None here, contradicting the
            # documented True/False contract of this method.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # BUGFIX: previously fell through and crashed on
                # match.group(1) (AttributeError on None); treat as failure.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # BUGFIX: same AttributeError crash as for secTok above.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before any extraction; sets cookies and logs in when
        # credentials were supplied.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
186
187
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Accepts every known single-video URL shape (plus the naked 11-char id).
    # Group 1 is the optional URL prefix; group 2 is always the video id.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the target of redirect pages (age verification etc.) that pass
    # the real watch URL in a next_url query parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static itag -> format metadata table, merged into the formats parsed
    # from the stream maps / DASH manifest.  Negative 'preference' values
    # rank whole format families (HLS, 3D, DASH) below the default streams.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
307
    IE_NAME = 'youtube'
    # Test matrix: plain videos, encrypted signatures, age gates, DASH
    # manifests, naked ids and non-square-pixel videos.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # JS player signature function name containing $
        {
            'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
            'info_dict': {
                'id': 'nfWlot6h_JM',
                'ext': 'm4a',
                'title': 'Taylor Swift - Shake It Off',
                'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
                'uploader': 'TaylorSwiftVEVO',
                'uploader_id': 'TaylorSwiftVEVO',
                'upload_date': '20140818',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # Age-gate video with encrypted signature
        {
            'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
            'info_dict': {
                'id': '6kLq3WMV1nU',
                'ext': 'mp4',
                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                'uploader': 'LloydVEVO',
                'uploader_id': 'LloydVEVO',
                'upload_date': '20110629',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',
                'upload_date': '20120731',
                'uploader_id': 'olympic',
                'description': 'HO09 - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                'uploader': 'Olympics',
                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
            },
            'params': {
                'skip_download': 'requires avconv',
            }
        },
        # Non-square pixels
        {
            'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
            'info_dict': {
                'id': '_b-2C3KPAM0',
                'ext': 'mp4',
                'stretched_ratio': 16 / 9.,
                'upload_date': '20110310',
                'uploader_id': 'AllenMeow',
                'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
                'uploader': '孫艾倫',
                'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
            },
        }
    ]
501
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature-decryption functions,
        # keyed by (player_url, signature cache id); see _decrypt_signature.
        self._player_cache = {}
505
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)
509
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)
513
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))
517
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
521
522 def _signature_cache_id(self, example_sig):
523 """ Return a string representation of a signature """
524 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
525
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers a scrambled signature string.

        The player type (JS or SWF) and id are parsed from player_url, and
        the resulting character-permutation spec is persisted in the
        filesystem cache so the player only has to be downloaded once per
        (player, signature-layout) pair.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a cache filename, so it must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: a pure permutation
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            # Derive the permutation spec by running the function on a
            # probe string of distinct characters and recording where each
            # output character came from.
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
567
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Used with --youtube-print-sig-code so the permutation can be
        hardcoded upstream; consecutive index runs are compressed into
        slice expressions.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step] with the pieces that are
                # defaults omitted for readability
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, merging +1/-1 runs into slices
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open slice run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
606
607 def _parse_sig_js(self, jscode):
608 funcname = self._search_regex(
609 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
610 'Initial JS player signature function name')
611
612 jsi = JSInterpreter(jscode)
613 initial_function = jsi.extract_function(funcname)
614 return lambda s: initial_function([s])
615
616 def _parse_sig_swf(self, file_contents):
617 swfi = SWFInterpreter(file_contents)
618 TARGET_CLASSNAME = 'SignatureDecipher'
619 searched_class = swfi.extract_class(TARGET_CLASSNAME)
620 initial_function = swfi.extract_function(searched_class, 'decipher')
621 return lambda s: initial_function([s])
622
623 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
624 """Turn the encrypted s field into a working signature"""
625
626 if player_url is None:
627 raise ExtractorError('Cannot decrypt signature without player_url')
628
629 if player_url.startswith('//'):
630 player_url = 'https:' + player_url
631 try:
632 player_id = (player_url, self._signature_cache_id(s))
633 if player_id not in self._player_cache:
634 func = self._extract_signature_function(
635 video_id, player_url, s
636 )
637 self._player_cache[player_id] = func
638 func = self._player_cache[player_id]
639 if self._downloader.params.get('youtube_print_sig_code'):
640 self._print_sig_code(func, s)
641 return func(s)
642 except Exception as e:
643 tb = traceback.format_exc()
644 raise ExtractorError(
645 'Signature extraction failed: ' + tb, cause=e)
646
647 def _get_available_subtitles(self, video_id, webpage):
648 try:
649 subs_doc = self._download_xml(
650 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
651 video_id, note=False)
652 except ExtractorError as err:
653 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
654 return {}
655
656 sub_lang_list = {}
657 for track in subs_doc.findall('track'):
658 lang = track.attrib['lang_code']
659 if lang in sub_lang_list:
660 continue
661 params = compat_urllib_parse.urlencode({
662 'lang': lang,
663 'v': video_id,
664 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
665 'name': track.attrib['name'].encode('utf-8'),
666 })
667 url = 'https://www.youtube.com/api/timedtext?' + params
668 sub_lang_list[lang] = url
669 if not sub_lang_list:
670 self._downloader.report_warning('video doesn\'t have subtitles')
671 return {}
672 return sub_lang_list
673
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL, or an
        empty dict (after a warning) when no automatic captions exist.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> is the original-language (possibly ASR) track
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # Each <target> is a language the original track can be
            # auto-translated into
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
721
722 @classmethod
723 def extract_id(cls, url):
724 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
725 if mobj is None:
726 raise ExtractorError('Invalid URL: %s' % url)
727 video_id = mobj.group(2)
728 return video_id
729
730 def _extract_from_m3u8(self, manifest_url, video_id):
731 url_map = {}
732
733 def _get_urls(_manifest):
734 lines = _manifest.split('\n')
735 urls = filter(lambda l: l and not l.startswith('#'),
736 lines)
737 return urls
738 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
739 formats_urls = _get_urls(manifest)
740 for format_url in formats_urls:
741 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
742 url_map[itag] = format_url
743 return url_map
744
    def _extract_annotations(self, video_id):
        # Fetch the raw annotations XML for the video (returned as text).
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
748
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        """Download and parse a DASH MPD, returning a list of format dicts.

        Encrypted '/s/<sig>' path components in the manifest URL are
        deciphered first; parsed representations are merged with the static
        _formats metadata by itag.
        """
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            # YouTube-proprietary attribute carrying the total stream size
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'height': int_or_none(r.attrib.get('height')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            # Merge with an already-seen representation of the same itag,
            # otherwise start from the static _formats metadata
            try:
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                full_info = self._formats.get(format_id, {}).copy()
                full_info.update(f)
                formats.append(full_info)
            else:
                existing_format.update(f)
        return formats
790
    def _real_extract(self, url):
        """Extract metadata and all available formats for a single video.

        Handles next_url redirect URLs, age-gated videos (via the embed page
        and get_video_info), ytplayer.config parsing with get_video_info
        fallback, encrypted signatures, RTMP/HLS streams and the optional
        DASH manifest.
        """
        # Respect --prefer-insecure for every URL built below.
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL (backslash-escaped inside swfConfig JSON)
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        embed_webpage = None
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        # Without a token the video is unplayable; surface YouTube's own
        # reason if it provided one.
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect-wrapper links with just their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Pull a comma-grouped integer (e.g. "1,234") out of the
            # watch-<name> element; None when the element is absent.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn an {itag: url} mapping into format dicts, enriched from
            # the static self._formats itag table where known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    # Plain (unencrypted) signature.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: we need the JS/SWF player to decrypt it.
                    encrypted_sig = url_data['s'][0]
                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE,
                        embed_webpage if age_gate else video_webpage,
                        'JS player URL (1)', default=None)
                    if not jsplayer_url_json and not age_gate:
                        # We need the embed website after all
                        if embed_webpage is None:
                            embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                            embed_webpage = self._download_webpage(
                                embed_url, video_id, 'Downloading embed webpage')
                        jsplayer_url_json = self._search_regex(
                            ASSETS_RE, embed_webpage, 'JS player URL')

                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            # Live / HLS-only videos expose an m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    # Hide the formats we found through non-DASH
                    dash_keys = set(df['format_id'] for df in dash_formats)
                    for f in formats:
                        if f['format_id'] in dash_keys:
                            f['format_id'] = 'nondash-%s' % f['format_id']
                            f['preference'] = f.get('preference', 0) - 10000
                    formats.extend(dash_formats)

        # Check for malformed aspect ratio
        stretched_m = re.search(
            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
            video_webpage)
        if stretched_m:
            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
            'formats': formats,
        }
1131
1132
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Fixed typo: was 'playlist_mincout', which the test runner would
        # silently ignore, so the count was never actually checked.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into url_result entries."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist.

        Mixes are generated from a single video; the id of the playlist is
        just 'RD' + video_id, and the entries are only present on the watch
        page of that seed video.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # PEP8: a named function instead of a lambda assignment (E731)
        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all entries of a playlist, following "Load more" pages."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1318
1319
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, de-duplicated, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Autogenerated channels list everything on one page; their
            # ajax continuation pages come back empty.
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Walk the "Load more" continuation pages lazily.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1384
1385
class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # GData API returns at most this many entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match otherwise.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url entries for one GData page (pagenum is 0-based)."""
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' means we paged past the last video.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1454
1455
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        # Keep paging until we have collected up to `limit` results;
        # `limit` shrinks once the API reports its total item count.
        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page.
        del video_ids[n:]
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(videos, query)
1497
1498
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same GData search as YoutubeSearchIE, but ordered by upload date
    # (orderby=published) and reachable through the "ytsearchdate" key.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1504
1505
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result entries straight out of a search results page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet,
                'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1547
1548
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return one playlist entry per season of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_matches)))
        season_entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': season_entries,
        }
1583
1584
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with a single %s placeholder left for the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect all feed entries, following the "Load more" paging token."""
        feed_entries = []
        # Starts as int 0 for the first request; subsequent values are the
        # string tokens captured from the load-more widget.
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1632
1633
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed: works with the default action_load_system_feed.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1639
1640
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # The list is tied to the logged-in account, so query the personal feed.
    _PERSONAL_FEED = True
1647
1648
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the previous plain literal relied on Python passing the
    # unknown '\.' escape through unchanged, which raises a warning on modern
    # interpreters; every sibling extractor already uses r'' for _VALID_URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history list is tied to the logged-in account.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1655
1656
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id and delegate to the playlist extractor."""
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1667
1668
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the subscriptions feed, following "Load more" pages."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same pagination scheme as playlists, but the video links here
        # carry no index attribute.
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1705
1706
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match youtube watch/attribution URLs that carry no video id.

    Such URLs usually result from the user forgetting to quote the URL,
    so the shell split it at '&'. Instead of a generic "unsupported URL"
    failure, _real_extract raises an error explaining the likely cause.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose regex ((?x)): a watch URL with only a non-id query parameter
    # (or none at all — note the trailing empty alternative plus '?'),
    # or an attribution link without a 'u' parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be downloaded from such a URL; report the likely
        # cause. (Fix: the original message concatenated '…" ' + ' or…',
        # producing a stray double space in the user-facing text.)
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
1750
1751
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the required 11 chars."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A truncated id can never be downloaded; fail with a clear message.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)