]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[youtube] Skip download for multiple v= test
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import re
10import time
11import traceback
12
13from .common import InfoExtractor, SearchInfoExtractor
14from ..jsinterp import JSInterpreter
15from ..swfinterp import SWFInterpreter
16from ..compat import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
24 compat_urlparse,
25 compat_str,
26)
27from ..utils import (
28 clean_html,
29 ExtractorError,
30 float_or_none,
31 get_element_by_attribute,
32 get_element_by_id,
33 int_or_none,
34 orderedSet,
35 parse_duration,
36 smuggle_url,
37 str_to_int,
38 unescapeHTML,
39 unified_strdate,
40 unsmuggle_url,
41 uppercase_escape,
42 ISO3166Utils,
43)
44
45
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force English pages (hl=en) so regex-based extraction sees a
        # stable page layout regardless of the user's locale.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap plain video IDs into url_result dicts handled by YoutubeIE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Download failed (fatal=False); signal failure explicitly
            # instead of falling through with an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Bail out instead of crashing on match.group(1) below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same: a missing field means the TFA form cannot be built.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once by InfoExtractor before the first extraction.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
197
198
199class YoutubeIE(YoutubeBaseInfoExtractor):
200 IE_DESC = 'YouTube.com'
201 _VALID_URL = r"""(?x)^
202 (
203 (?:https?://|//) # http(s):// or protocol-independent URL
204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
205 (?:www\.)?deturl\.com/www\.youtube\.com/|
206 (?:www\.)?pwnyoutube\.com/|
207 (?:www\.)?yourepeat\.com/|
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
213 |(?: # or the v= param in all its forms
214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
215 (?:\?|\#!?) # the params delimiter ? or # or #!
216 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
217 v=
218 )
219 ))
220 |youtu\.be/ # just youtu.be/xxxx
221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
222 )
223 )? # all until now is optional -> you can pass the naked ID
224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
226 (?(1).+)? # if we found the ID, everything can follow
227 $"""
228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
229 _formats = {
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
247
248
249 # 3d videos
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
257
258 # Apple HTTP Live Streaming
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
266
267 # DASH mp4 video
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
279
280 # Dash mp4 audio
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
284
285 # Dash webm
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
307
308 # Dash webm audio
309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
311
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
316
317 # RTMP (unnamed)
318 '_rtmp': {'protocol': 'rtmp'},
319 }
320
321 IE_NAME = 'youtube'
322 _TESTS = [
323 {
324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
325 'info_dict': {
326 'id': 'BaW_jenozKc',
327 'ext': 'mp4',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
334 'tags': ['youtube-dl'],
335 'like_count': int,
336 'dislike_count': int,
337 'start_time': 1,
338 'end_time': 9,
339 }
340 },
341 {
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
344 'info_dict': {
345 'id': 'UxxajLWwzqY',
346 'ext': 'mp4',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
355 }
356 },
357 {
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
360 'info_dict': {
361 'id': '07FYdnEawAQ',
362 'ext': 'mp4',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
368 }
369 },
370 {
371 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
372 'note': 'Embed-only video (#1746)',
373 'info_dict': {
374 'id': 'yZIXLfi8CZQ',
375 'ext': 'mp4',
376 'upload_date': '20120608',
377 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
378 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
379 'uploader': 'SET India',
380 'uploader_id': 'setindia'
381 }
382 },
383 {
384 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
385 'note': 'Use the first video ID in the URL',
386 'info_dict': {
387 'id': 'BaW_jenozKc',
388 'ext': 'mp4',
389 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
390 'uploader': 'Philipp Hagemeister',
391 'uploader_id': 'phihag',
392 'upload_date': '20121002',
393 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
394 'categories': ['Science & Technology'],
395 'tags': ['youtube-dl'],
396 'like_count': int,
397 'dislike_count': int,
398 },
399 'params': {
400 'skip_download': True,
401 },
402 },
403 {
404 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
405 'note': '256k DASH audio (format 141) via DASH manifest',
406 'info_dict': {
407 'id': 'a9LDPn-MO4I',
408 'ext': 'm4a',
409 'upload_date': '20121002',
410 'uploader_id': '8KVIDEO',
411 'description': '',
412 'uploader': '8KVIDEO',
413 'title': 'UHDTV TEST 8K VIDEO.mp4'
414 },
415 'params': {
416 'youtube_include_dash_manifest': True,
417 'format': '141',
418 },
419 },
420 # DASH manifest with encrypted signature
421 {
422 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
423 'info_dict': {
424 'id': 'IB3lcPjvWLA',
425 'ext': 'm4a',
426 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
427 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
428 'uploader': 'AfrojackVEVO',
429 'uploader_id': 'AfrojackVEVO',
430 'upload_date': '20131011',
431 },
432 'params': {
433 'youtube_include_dash_manifest': True,
434 'format': '141',
435 },
436 },
437 # JS player signature function name containing $
438 {
439 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
440 'info_dict': {
441 'id': 'nfWlot6h_JM',
442 'ext': 'm4a',
443 'title': 'Taylor Swift - Shake It Off',
444 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
445 'uploader': 'TaylorSwiftVEVO',
446 'uploader_id': 'TaylorSwiftVEVO',
447 'upload_date': '20140818',
448 },
449 'params': {
450 'youtube_include_dash_manifest': True,
451 'format': '141',
452 },
453 },
454 # Controversy video
455 {
456 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
457 'info_dict': {
458 'id': 'T4XJQO3qol8',
459 'ext': 'mp4',
460 'upload_date': '20100909',
461 'uploader': 'The Amazing Atheist',
462 'uploader_id': 'TheAmazingAtheist',
463 'title': 'Burning Everyone\'s Koran',
464 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
465 }
466 },
467 # Normal age-gate video (No vevo, embed allowed)
468 {
469 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
470 'info_dict': {
471 'id': 'HtVdAasjOgU',
472 'ext': 'mp4',
473 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
474 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
475 'uploader': 'The Witcher',
476 'uploader_id': 'WitcherGame',
477 'upload_date': '20140605',
478 },
479 },
480 # Age-gate video with encrypted signature
481 {
482 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
483 'info_dict': {
484 'id': '6kLq3WMV1nU',
485 'ext': 'mp4',
486 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
487 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
488 'uploader': 'LloydVEVO',
489 'uploader_id': 'LloydVEVO',
490 'upload_date': '20110629',
491 },
492 },
493 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
494 {
495 'url': '__2ABJjxzNo',
496 'info_dict': {
497 'id': '__2ABJjxzNo',
498 'ext': 'mp4',
499 'upload_date': '20100430',
500 'uploader_id': 'deadmau5',
501 'description': 'md5:12c56784b8032162bb936a5f76d55360',
502 'uploader': 'deadmau5',
503 'title': 'Deadmau5 - Some Chords (HD)',
504 },
505 'expected_warnings': [
506 'DASH manifest missing',
507 ]
508 },
509 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
510 {
511 'url': 'lqQg6PlCWgI',
512 'info_dict': {
513 'id': 'lqQg6PlCWgI',
514 'ext': 'mp4',
515 'upload_date': '20120731',
516 'uploader_id': 'olympic',
517 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
518 'uploader': 'Olympics',
519 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
520 },
521 'params': {
522 'skip_download': 'requires avconv',
523 }
524 },
525 # Non-square pixels
526 {
527 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
528 'info_dict': {
529 'id': '_b-2C3KPAM0',
530 'ext': 'mp4',
531 'stretched_ratio': 16 / 9.,
532 'upload_date': '20110310',
533 'uploader_id': 'AllenMeow',
534 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
535 'uploader': '孫艾倫',
536 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
537 },
538 },
539 # url_encoded_fmt_stream_map is empty string
540 {
541 'url': 'qEJwOuvDf7I',
542 'info_dict': {
543 'id': 'qEJwOuvDf7I',
544 'ext': 'mp4',
545 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
546 'description': '',
547 'upload_date': '20150404',
548 'uploader_id': 'spbelect',
549 'uploader': 'Наблюдатели Петербурга',
550 },
551 'params': {
552 'skip_download': 'requires avconv',
553 }
554 },
555 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
556 {
557 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
558 'info_dict': {
559 'id': 'FIl7x6_3R5Y',
560 'ext': 'mp4',
561 'title': 'md5:7b81415841e02ecd4313668cde88737a',
562 'description': 'md5:116377fd2963b81ec4ce64b542173306',
563 'upload_date': '20150625',
564 'uploader_id': 'dorappi2000',
565 'uploader': 'dorappi2000',
566 'formats': 'mincount:33',
567 },
568 },
569 # DASH manifest with segment_list
570 {
571 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
572 'md5': '8ce563a1d667b599d21064e982ab9e31',
573 'info_dict': {
574 'id': 'CsmdDsKjzN8',
575 'ext': 'mp4',
576 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
577 'uploader': 'Airtek',
578 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
579 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
580 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
581 },
582 'params': {
583 'youtube_include_dash_manifest': True,
584 'format': '135', # bestvideo
585 }
586 },
587 {
588 # Multifeed videos (multiple cameras), URL is for Main Camera
589 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
590 'info_dict': {
591 'id': 'jqWvoWXjCVs',
592 'title': 'teamPGP: Rocket League Noob Stream',
593 'description': 'md5:dc7872fb300e143831327f1bae3af010',
594 },
595 'playlist': [{
596 'info_dict': {
597 'id': 'jqWvoWXjCVs',
598 'ext': 'mp4',
599 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
600 'description': 'md5:dc7872fb300e143831327f1bae3af010',
601 'upload_date': '20150721',
602 'uploader': 'Beer Games Beer',
603 'uploader_id': 'beergamesbeer',
604 },
605 }, {
606 'info_dict': {
607 'id': '6h8e8xoXJzg',
608 'ext': 'mp4',
609 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
610 'description': 'md5:dc7872fb300e143831327f1bae3af010',
611 'upload_date': '20150721',
612 'uploader': 'Beer Games Beer',
613 'uploader_id': 'beergamesbeer',
614 },
615 }, {
616 'info_dict': {
617 'id': 'PUOgX5z9xZw',
618 'ext': 'mp4',
619 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
620 'description': 'md5:dc7872fb300e143831327f1bae3af010',
621 'upload_date': '20150721',
622 'uploader': 'Beer Games Beer',
623 'uploader_id': 'beergamesbeer',
624 },
625 }, {
626 'info_dict': {
627 'id': 'teuwxikvS5k',
628 'ext': 'mp4',
629 'title': 'teamPGP: Rocket League Noob Stream (zim)',
630 'description': 'md5:dc7872fb300e143831327f1bae3af010',
631 'upload_date': '20150721',
632 'uploader': 'Beer Games Beer',
633 'uploader_id': 'beergamesbeer',
634 },
635 }],
636 'params': {
637 'skip_download': True,
638 },
639 }
640 ]
641
642 def __init__(self, *args, **kwargs):
643 super(YoutubeIE, self).__init__(*args, **kwargs)
644 self._player_cache = {}
645
646 def report_video_info_webpage_download(self, video_id):
647 """Report attempt to download video info webpage."""
648 self.to_screen('%s: Downloading video info webpage' % video_id)
649
650 def report_information_extraction(self, video_id):
651 """Report attempt to extract video information."""
652 self.to_screen('%s: Extracting video information' % video_id)
653
654 def report_unavailable_format(self, video_id, format):
655 """Report extracted video URL."""
656 self.to_screen('%s: Format %s not available' % (video_id, format))
657
658 def report_rtmp_download(self):
659 """Indicate the download will use the RTMP protocol."""
660 self.to_screen('RTMP download detected')
661
662 def _signature_cache_id(self, example_sig):
663 """ Return a string representation of a signature """
664 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
665
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the on-disk cache) the signature-decryption
        function for the player at player_url. example_sig contributes only
        its part-length pattern, which keys the cache entry."""
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a filename component; ensure it cannot escape
        # the cache directory.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # cache_spec is a list of source indices: output char i is taken
            # from input position cache_spec[i].
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        # Dispatch on the player container: JS players are interpreted with
        # JSInterpreter, Flash players with SWFInterpreter.
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of distinct characters
        # so each output char reveals which input position it came from, then
        # persist that permutation spec for future runs.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
711
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*
        (used with the youtube_print_sig_code option, see below)."""
        def gen_sig_code(idxs):
            # Compress the index list into s[i] terms and s[a:b:c] slices.
            def _genslice(start, end, step):
                # Render one contiguous run of indices as a Python slice.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it or flush it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices open a new run (step +1 or -1).
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element/run (i leaks out of the for loop above).
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Trace func on a string of distinct characters so each output char
        # reveals its source index.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
750
751 def _parse_sig_js(self, jscode):
752 funcname = self._search_regex(
753 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
754 'Initial JS player signature function name')
755
756 jsi = JSInterpreter(jscode)
757 initial_function = jsi.extract_function(funcname)
758 return lambda s: initial_function([s])
759
760 def _parse_sig_swf(self, file_contents):
761 swfi = SWFInterpreter(file_contents)
762 TARGET_CLASSNAME = 'SignatureDecipher'
763 searched_class = swfi.extract_class(TARGET_CLASSNAME)
764 initial_function = swfi.extract_function(searched_class, 'decipher')
765 return lambda s: initial_function([s])
766
767 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
768 """Turn the encrypted s field into a working signature"""
769
770 if player_url is None:
771 raise ExtractorError('Cannot decrypt signature without player_url')
772
773 if player_url.startswith('//'):
774 player_url = 'https:' + player_url
775 try:
776 player_id = (player_url, self._signature_cache_id(s))
777 if player_id not in self._player_cache:
778 func = self._extract_signature_function(
779 video_id, player_url, s
780 )
781 self._player_cache[player_id] = func
782 func = self._player_cache[player_id]
783 if self._downloader.params.get('youtube_print_sig_code'):
784 self._print_sig_code(func, s)
785 return func(s)
786 except Exception as e:
787 tb = traceback.format_exc()
788 raise ExtractorError(
789 'Signature extraction failed: ' + tb, cause=e)
790
791 def _get_subtitles(self, video_id, webpage):
792 try:
793 subs_doc = self._download_xml(
794 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
795 video_id, note=False)
796 except ExtractorError as err:
797 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
798 return {}
799
800 sub_lang_list = {}
801 for track in subs_doc.findall('track'):
802 lang = track.attrib['lang_code']
803 if lang in sub_lang_list:
804 continue
805 sub_formats = []
806 for ext in ['sbv', 'vtt', 'srt']:
807 params = compat_urllib_parse.urlencode({
808 'lang': lang,
809 'v': video_id,
810 'fmt': ext,
811 'name': track.attrib['name'].encode('utf-8'),
812 })
813 sub_formats.append({
814 'url': 'https://www.youtube.com/api/timedtext?' + params,
815 'ext': ext,
816 })
817 sub_lang_list[lang] = sub_formats
818 if not sub_lang_list:
819 self._downloader.report_warning('video doesn\'t have subtitles')
820 return {}
821 return sub_lang_list
822
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # The caption (tts) base URL lives inside the inline player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            # KeyError here (no ttsurl/timestamp) means no automatic captions;
            # it is caught by the except clause below.
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> node is the original (ASR) language.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # Each <target> node is a language the captions can be
            # machine-translated into; offer every supported format for it.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in ['sbv', 'vtt', 'srt']:
                    params = compat_urllib_parse.urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
875
876 @classmethod
877 def extract_id(cls, url):
878 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
879 if mobj is None:
880 raise ExtractorError('Invalid URL: %s' % url)
881 video_id = mobj.group(2)
882 return video_id
883
884 def _extract_from_m3u8(self, manifest_url, video_id):
885 url_map = {}
886
887 def _get_urls(_manifest):
888 lines = _manifest.split('\n')
889 urls = filter(lambda l: l and not l.startswith('#'),
890 lines)
891 return urls
892 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
893 formats_urls = _get_urls(manifest)
894 for format_url in formats_urls:
895 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
896 url_map[itag] = format_url
897 return url_map
898
899 def _extract_annotations(self, video_id):
900 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
901 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
902
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
        """Download a DASH MPD and return a list of format dicts.

        Returns [] when the manifest download fails and *fatal* is False.
        """
        def decrypt_sig(mobj):
            # Replace an encrypted '/s/<sig>' path component of the manifest
            # URL with its decrypted '/signature/<sig>' form.
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest',
            fatal=fatal)

        if dash_doc is False:
            return []

        formats = []
        for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        # Segmented DASH: expose the init segment and media
                        # segment URLs so the downloader can fetch piecewise.
                        f.update({
                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                            'protocol': 'http_dash_segments',
                        })
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        # First time we see this itag: merge static format
                        # metadata (self._formats) with what the MPD carries.
                        full_info = self._formats.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            # The MPD carries a single codecs string; attribute
                            # it to whichever track this itag is known to lack.
                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                                full_info['vcodec'] = codecs
                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                                full_info['acodec'] = codecs
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
969
970 def _real_extract(self, url):
971 url, smuggled_data = unsmuggle_url(url, {})
972
973 proto = (
974 'http' if self._downloader.params.get('prefer_insecure', False)
975 else 'https')
976
977 start_time = None
978 end_time = None
979 parsed_url = compat_urllib_parse_urlparse(url)
980 for component in [parsed_url.fragment, parsed_url.query]:
981 query = compat_parse_qs(component)
982 if start_time is None and 't' in query:
983 start_time = parse_duration(query['t'][0])
984 if start_time is None and 'start' in query:
985 start_time = parse_duration(query['start'][0])
986 if end_time is None and 'end' in query:
987 end_time = parse_duration(query['end'][0])
988
989 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
990 mobj = re.search(self._NEXT_URL_RE, url)
991 if mobj:
992 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
993 video_id = self.extract_id(url)
994
995 # Get video webpage
996 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
997 video_webpage = self._download_webpage(url, video_id)
998
999 # Attempt to extract SWF player URL
1000 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1001 if mobj is not None:
1002 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1003 else:
1004 player_url = None
1005
1006 dash_mpds = []
1007
1008 def add_dash_mpd(video_info):
1009 dash_mpd = video_info.get('dashmpd')
1010 if dash_mpd and dash_mpd[0] not in dash_mpds:
1011 dash_mpds.append(dash_mpd[0])
1012
1013 # Get video info
1014 embed_webpage = None
1015 is_live = None
1016 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1017 age_gate = True
1018 # We simulate the access to the video from www.youtube.com/v/{video_id}
1019 # this can be viewed without login into Youtube
1020 url = proto + '://www.youtube.com/embed/%s' % video_id
1021 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1022 data = compat_urllib_parse.urlencode({
1023 'video_id': video_id,
1024 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1025 'sts': self._search_regex(
1026 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1027 })
1028 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1029 video_info_webpage = self._download_webpage(
1030 video_info_url, video_id,
1031 note='Refetching age-gated info webpage',
1032 errnote='unable to download video info webpage')
1033 video_info = compat_parse_qs(video_info_webpage)
1034 add_dash_mpd(video_info)
1035 else:
1036 age_gate = False
1037 video_info = None
1038 # Try looking directly into the video webpage
1039 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1040 if mobj:
1041 json_code = uppercase_escape(mobj.group(1))
1042 ytplayer_config = json.loads(json_code)
1043 args = ytplayer_config['args']
1044 if args.get('url_encoded_fmt_stream_map'):
1045 # Convert to the same format returned by compat_parse_qs
1046 video_info = dict((k, [v]) for k, v in args.items())
1047 add_dash_mpd(video_info)
1048 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1049 is_live = True
1050 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1051 # We also try looking in get_video_info since it may contain different dashmpd
1052 # URL that points to a DASH manifest with possibly different itag set (some itags
1053 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1054 # manifest pointed by get_video_info's dashmpd).
1055 # The general idea is to take a union of itags of both DASH manifests (for example
1056 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1057 self.report_video_info_webpage_download(video_id)
1058 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1059 video_info_url = (
1060 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1061 % (proto, video_id, el_type))
1062 video_info_webpage = self._download_webpage(
1063 video_info_url,
1064 video_id, note=False,
1065 errnote='unable to download video info webpage')
1066 get_video_info = compat_parse_qs(video_info_webpage)
1067 if get_video_info.get('use_cipher_signature') != ['True']:
1068 add_dash_mpd(get_video_info)
1069 if not video_info:
1070 video_info = get_video_info
1071 if 'token' in get_video_info:
1072 break
1073 if 'token' not in video_info:
1074 if 'reason' in video_info:
1075 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1076 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1077 if regions_allowed:
1078 raise ExtractorError('YouTube said: This video is available in %s only' % (
1079 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1080 expected=True)
1081 raise ExtractorError(
1082 'YouTube said: %s' % video_info['reason'][0],
1083 expected=True, video_id=video_id)
1084 else:
1085 raise ExtractorError(
1086 '"token" parameter not in video info for unknown reason',
1087 video_id=video_id)
1088
1089 # title
1090 if 'title' in video_info:
1091 video_title = video_info['title'][0]
1092 else:
1093 self._downloader.report_warning('Unable to extract video title')
1094 video_title = '_'
1095
1096 # description
1097 video_description = get_element_by_id("eow-description", video_webpage)
1098 if video_description:
1099 video_description = re.sub(r'''(?x)
1100 <a\s+
1101 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1102 title="([^"]+)"\s+
1103 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1104 class="yt-uix-redirect-link"\s*>
1105 [^<]+
1106 </a>
1107 ''', r'\1', video_description)
1108 video_description = clean_html(video_description)
1109 else:
1110 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1111 if fd_mobj:
1112 video_description = unescapeHTML(fd_mobj.group(1))
1113 else:
1114 video_description = ''
1115
1116 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1117 if not self._downloader.params.get('noplaylist'):
1118 entries = []
1119 feed_ids = []
1120 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1121 for feed in multifeed_metadata_list.split(','):
1122 feed_data = compat_parse_qs(feed)
1123 entries.append({
1124 '_type': 'url_transparent',
1125 'ie_key': 'Youtube',
1126 'url': smuggle_url(
1127 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1128 {'force_singlefeed': True}),
1129 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1130 })
1131 feed_ids.append(feed_data['id'][0])
1132 self.to_screen(
1133 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1134 % (', '.join(feed_ids), video_id))
1135 return self.playlist_result(entries, video_id, video_title, video_description)
1136 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1137
1138 if 'view_count' in video_info:
1139 view_count = int(video_info['view_count'][0])
1140 else:
1141 view_count = None
1142
1143 # Check for "rental" videos
1144 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1145 raise ExtractorError('"rental" videos not supported')
1146
1147 # Start extracting information
1148 self.report_information_extraction(video_id)
1149
1150 # uploader
1151 if 'author' not in video_info:
1152 raise ExtractorError('Unable to extract uploader name')
1153 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1154
1155 # uploader_id
1156 video_uploader_id = None
1157 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1158 if mobj is not None:
1159 video_uploader_id = mobj.group(1)
1160 else:
1161 self._downloader.report_warning('unable to extract uploader nickname')
1162
1163 # thumbnail image
1164 # We try first to get a high quality image:
1165 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1166 video_webpage, re.DOTALL)
1167 if m_thumb is not None:
1168 video_thumbnail = m_thumb.group(1)
1169 elif 'thumbnail_url' not in video_info:
1170 self._downloader.report_warning('unable to extract video thumbnail')
1171 video_thumbnail = None
1172 else: # don't panic if we can't find it
1173 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1174
1175 # upload date
1176 upload_date = self._html_search_meta(
1177 'datePublished', video_webpage, 'upload date', default=None)
1178 if not upload_date:
1179 upload_date = self._search_regex(
1180 [r'(?s)id="eow-date.*?>(.*?)</span>',
1181 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1182 video_webpage, 'upload date', default=None)
1183 if upload_date:
1184 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1185 upload_date = unified_strdate(upload_date)
1186
1187 m_cat_container = self._search_regex(
1188 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1189 video_webpage, 'categories', default=None)
1190 if m_cat_container:
1191 category = self._html_search_regex(
1192 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1193 default=None)
1194 video_categories = None if category is None else [category]
1195 else:
1196 video_categories = None
1197
1198 video_tags = [
1199 unescapeHTML(m.group('content'))
1200 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1201
1202 def _extract_count(count_name):
1203 return str_to_int(self._search_regex(
1204 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1205 % re.escape(count_name),
1206 video_webpage, count_name, default=None))
1207
1208 like_count = _extract_count('like')
1209 dislike_count = _extract_count('dislike')
1210
1211 # subtitles
1212 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1213 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1214
1215 if 'length_seconds' not in video_info:
1216 self._downloader.report_warning('unable to extract video duration')
1217 video_duration = None
1218 else:
1219 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1220
1221 # annotations
1222 video_annotations = None
1223 if self._downloader.params.get('writeannotations', False):
1224 video_annotations = self._extract_annotations(video_id)
1225
1226 def _map_to_format_list(urlmap):
1227 formats = []
1228 for itag, video_real_url in urlmap.items():
1229 dct = {
1230 'format_id': itag,
1231 'url': video_real_url,
1232 'player_url': player_url,
1233 }
1234 if itag in self._formats:
1235 dct.update(self._formats[itag])
1236 formats.append(dct)
1237 return formats
1238
1239 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1240 self.report_rtmp_download()
1241 formats = [{
1242 'format_id': '_rtmp',
1243 'protocol': 'rtmp',
1244 'url': video_info['conn'][0],
1245 'player_url': player_url,
1246 }]
1247 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1248 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1249 if 'rtmpe%3Dyes' in encoded_url_map:
1250 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1251 url_map = {}
1252 for url_data_str in encoded_url_map.split(','):
1253 url_data = compat_parse_qs(url_data_str)
1254 if 'itag' not in url_data or 'url' not in url_data:
1255 continue
1256 format_id = url_data['itag'][0]
1257 url = url_data['url'][0]
1258
1259 if 'sig' in url_data:
1260 url += '&signature=' + url_data['sig'][0]
1261 elif 's' in url_data:
1262 encrypted_sig = url_data['s'][0]
1263 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1264
1265 jsplayer_url_json = self._search_regex(
1266 ASSETS_RE,
1267 embed_webpage if age_gate else video_webpage,
1268 'JS player URL (1)', default=None)
1269 if not jsplayer_url_json and not age_gate:
1270 # We need the embed website after all
1271 if embed_webpage is None:
1272 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1273 embed_webpage = self._download_webpage(
1274 embed_url, video_id, 'Downloading embed webpage')
1275 jsplayer_url_json = self._search_regex(
1276 ASSETS_RE, embed_webpage, 'JS player URL')
1277
1278 player_url = json.loads(jsplayer_url_json)
1279 if player_url is None:
1280 player_url_json = self._search_regex(
1281 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1282 video_webpage, 'age gate player URL')
1283 player_url = json.loads(player_url_json)
1284
1285 if self._downloader.params.get('verbose'):
1286 if player_url is None:
1287 player_version = 'unknown'
1288 player_desc = 'unknown'
1289 else:
1290 if player_url.endswith('swf'):
1291 player_version = self._search_regex(
1292 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1293 'flash player', fatal=False)
1294 player_desc = 'flash player %s' % player_version
1295 else:
1296 player_version = self._search_regex(
1297 r'html5player-([^/]+?)(?:/html5player)?\.js',
1298 player_url,
1299 'html5 player', fatal=False)
1300 player_desc = 'html5 player %s' % player_version
1301
1302 parts_sizes = self._signature_cache_id(encrypted_sig)
1303 self.to_screen('{%s} signature length %s, %s' %
1304 (format_id, parts_sizes, player_desc))
1305
1306 signature = self._decrypt_signature(
1307 encrypted_sig, video_id, player_url, age_gate)
1308 url += '&signature=' + signature
1309 if 'ratebypass' not in url:
1310 url += '&ratebypass=yes'
1311 url_map[format_id] = url
1312 formats = _map_to_format_list(url_map)
1313 elif video_info.get('hlsvp'):
1314 manifest_url = video_info['hlsvp'][0]
1315 url_map = self._extract_from_m3u8(manifest_url, video_id)
1316 formats = _map_to_format_list(url_map)
1317 else:
1318 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1319
1320 # Look for the DASH manifest
1321 if self._downloader.params.get('youtube_include_dash_manifest', True):
1322 dash_mpd_fatal = True
1323 for dash_manifest_url in dash_mpds:
1324 dash_formats = {}
1325 try:
1326 for df in self._parse_dash_manifest(
1327 video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1328 # Do not overwrite DASH format found in some previous DASH manifest
1329 if df['format_id'] not in dash_formats:
1330 dash_formats[df['format_id']] = df
1331 # Additional DASH manifests may end up in HTTP Error 403 therefore
1332 # allow them to fail without bug report message if we already have
1333 # some DASH manifest succeeded. This is temporary workaround to reduce
1334 # burst of bug reports until we figure out the reason and whether it
1335 # can be fixed at all.
1336 dash_mpd_fatal = False
1337 except (ExtractorError, KeyError) as e:
1338 self.report_warning(
1339 'Skipping DASH manifest: %r' % e, video_id)
1340 if dash_formats:
1341 # Remove the formats we found through non-DASH, they
1342 # contain less info and it can be wrong, because we use
1343 # fixed values (for example the resolution). See
1344 # https://github.com/rg3/youtube-dl/issues/5774 for an
1345 # example.
1346 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1347 formats.extend(dash_formats.values())
1348
1349 # Check for malformed aspect ratio
1350 stretched_m = re.search(
1351 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1352 video_webpage)
1353 if stretched_m:
1354 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1355 for f in formats:
1356 if f.get('vcodec') != 'none':
1357 f['stretched_ratio'] = ratio
1358
1359 self._sort_formats(formats)
1360
1361 return {
1362 'id': video_id,
1363 'uploader': video_uploader,
1364 'uploader_id': video_uploader_id,
1365 'upload_date': upload_date,
1366 'title': video_title,
1367 'thumbnail': video_thumbnail,
1368 'description': video_description,
1369 'categories': video_categories,
1370 'tags': video_tags,
1371 'subtitles': video_subtitles,
1372 'automatic_captions': automatic_captions,
1373 'duration': video_duration,
1374 'age_limit': 18 if age_gate else 0,
1375 'annotations': video_annotations,
1376 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1377 'view_count': view_count,
1378 'like_count': like_count,
1379 'dislike_count': dislike_count,
1380 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1381 'formats': formats,
1382 'is_live': is_live,
1383 'start_time': start_time,
1384 'end_time': end_time,
1385 }
1386
1387
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Fix: was misspelled 'playlist_mincout', which silently disabled
        # the minimum-count check for this test.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix, whose contents are generated on the fly."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following 'Load more' pagination lazily."""
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch to mix or regular playlist extraction for *url*."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith(('RD', 'UL')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1580
1581
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs scraped from a channel page.

        Duplicate ids are merged, keeping the first non-empty title seen.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                # Already seen: upgrade its title if we only now found one.
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = self._html_search_meta(
            'channelId', channel_page, 'channel id', default=None)
        if not channel_playlist_id:
            channel_playlist_id = self._search_regex(
                r'data-channel-external-id="([^"]+)"',
                channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # A 'UC…' channel id maps to the 'UU…' uploads playlist.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the "Load more" AJAX pagination.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1674
1675
class YoutubeUserIE(YoutubeChannelIE):
    """Extract the videos of a YouTube user (same machinery as channels)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL above is deliberately loose, so give every other
        # YouTube extractor the first shot at the URL and only claim it
        # when none of them match.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
1702
1703
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the results feed, collecting video entries until
        either n results have been gathered or a page yields nothing
        new, and returns a playlist truncated to at most n entries.
        """
        videos = []
        limit = n

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as we have enough results or the page brought
            # nothing new.  This was previously 'len(videos) > limit',
            # which always downloaded one page more than necessary when
            # the result count landed exactly on the limit.
            if not new_videos or len(videos) >= limit:
                break

        # Slicing is a no-op when fewer than n results were collected.
        return self.playlist_result(videos[:n], query)
1747
1748
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant ordering results by upload date."""
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
1754
1755
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list of a YouTube search-results URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)

        # The results live inside the "item-section" <ol>.
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1797
1798
class YoutubeShowIE(InfoExtractor):
    """Extract a multi-season show as a playlist of season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_matches)))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': [
                self.url_result(
                    'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in season_matches
            ],
        }
1833
1834
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the logged-in feed and return it as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # NOTE: this must be a list, not filter(): on Python 3 filter()
            # returns a lazy iterator that is always truthy, so the
            # emptiness check below could never fire (and ids.extend()
            # would consume the iterator anyway).
            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1882
1883
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extract the authenticated user's Watch Later list."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
    # The parent class ships playlist tests that do not apply here.
    _TESTS = []

    def _real_extract(self, url):
        # Watch Later is always the special 'WL' playlist, regardless of URL.
        return self._extract_playlist('WL')
1893
1894
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the favourites playlist; hand it off
        # to the playlist extractor.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1905
1906
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1912
1913
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1919
1920
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is an invalid escape in a plain str (DeprecationWarning
    # on Python 3.6+) and every sibling class uses r'' for _VALID_URL.
    # The matched pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1926
1927
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs whose v= parameter was lost (typically
    because the URL was pasted into a shell unquoted) and fail with a
    helpful message instead of a cryptic error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
1971
1972
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the canonical 11
    characters and fail with an explanatory error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)