# youtube_dl/extractor/youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..compat import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
24 compat_urlparse,
25 compat_str,
26 )
27 from ..utils import (
28 clean_html,
29 ExtractorError,
30 float_or_none,
31 get_element_by_attribute,
32 get_element_by_id,
33 int_or_none,
34 orderedSet,
35 parse_duration,
36 smuggle_url,
37 str_to_int,
38 unescapeHTML,
39 unified_strdate,
40 unsmuggle_url,
41 uppercase_escape,
42 ISO3166Utils,
43 )
44
45
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force an English interface (hl=en) so regex-based scraping is stable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Convert a list of video IDs into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Download failed (non-fatal): report login failure per the
            # documented contract instead of implicitly returning None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # BUGFIX: previously fell through to match.group(1) and crashed
                # with AttributeError; fail the login gracefully instead.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # BUGFIX: same crash-on-None as secTok above.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # If the login form is still present after submission, the credentials
        # were rejected. (After a successful TFA flow, login_results holds the
        # second-factor page, so this check does not false-positive there.)
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before extraction: set the language cookie and try login.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
197
198
199 class YoutubeIE(YoutubeBaseInfoExtractor):
200 IE_DESC = 'YouTube.com'
201 _VALID_URL = r"""(?x)^
202 (
203 (?:https?://|//) # http(s):// or protocol-independent URL
204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
205 (?:www\.)?deturl\.com/www\.youtube\.com/|
206 (?:www\.)?pwnyoutube\.com/|
207 (?:www\.)?yourepeat\.com/|
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
213 |(?: # or the v= param in all its forms
214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
215 (?:\?|\#!?) # the params delimiter ? or # or #!
216 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
217 v=
218 )
219 ))
220 |youtu\.be/ # just youtu.be/xxxx
221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
222 )
223 )? # all until now is optional -> you can pass the naked ID
224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
226 (?(1).+)? # if we found the ID, everything can follow
227 $"""
228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
229 _formats = {
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
247
248
249 # 3d videos
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
257
258 # Apple HTTP Live Streaming
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
266
267 # DASH mp4 video
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
279
280 # Dash mp4 audio
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
284
285 # Dash webm
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
307
308 # Dash webm audio
309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
311
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
316
317 # RTMP (unnamed)
318 '_rtmp': {'protocol': 'rtmp'},
319 }
320
321 IE_NAME = 'youtube'
322 _TESTS = [
323 {
324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
325 'info_dict': {
326 'id': 'BaW_jenozKc',
327 'ext': 'mp4',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
334 'tags': ['youtube-dl'],
335 'like_count': int,
336 'dislike_count': int,
337 'start_time': 1,
338 'end_time': 9,
339 }
340 },
341 {
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
344 'info_dict': {
345 'id': 'UxxajLWwzqY',
346 'ext': 'mp4',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
355 }
356 },
357 {
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
360 'info_dict': {
361 'id': '07FYdnEawAQ',
362 'ext': 'mp4',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
368 'age_limit': 18,
369 }
370 },
371 {
372 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
373 'note': 'Embed-only video (#1746)',
374 'info_dict': {
375 'id': 'yZIXLfi8CZQ',
376 'ext': 'mp4',
377 'upload_date': '20120608',
378 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
379 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
380 'uploader': 'SET India',
381 'uploader_id': 'setindia'
382 }
383 },
384 {
385 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
386 'note': 'Use the first video ID in the URL',
387 'info_dict': {
388 'id': 'BaW_jenozKc',
389 'ext': 'mp4',
390 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
391 'uploader': 'Philipp Hagemeister',
392 'uploader_id': 'phihag',
393 'upload_date': '20121002',
394 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
395 'categories': ['Science & Technology'],
396 'tags': ['youtube-dl'],
397 'like_count': int,
398 'dislike_count': int,
399 },
400 'params': {
401 'skip_download': True,
402 },
403 },
404 {
405 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
406 'note': '256k DASH audio (format 141) via DASH manifest',
407 'info_dict': {
408 'id': 'a9LDPn-MO4I',
409 'ext': 'm4a',
410 'upload_date': '20121002',
411 'uploader_id': '8KVIDEO',
412 'description': '',
413 'uploader': '8KVIDEO',
414 'title': 'UHDTV TEST 8K VIDEO.mp4'
415 },
416 'params': {
417 'youtube_include_dash_manifest': True,
418 'format': '141',
419 },
420 },
421 # DASH manifest with encrypted signature
422 {
423 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
424 'info_dict': {
425 'id': 'IB3lcPjvWLA',
426 'ext': 'm4a',
427 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
428 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
429 'uploader': 'AfrojackVEVO',
430 'uploader_id': 'AfrojackVEVO',
431 'upload_date': '20131011',
432 },
433 'params': {
434 'youtube_include_dash_manifest': True,
435 'format': '141',
436 },
437 },
438 # JS player signature function name containing $
439 {
440 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
441 'info_dict': {
442 'id': 'nfWlot6h_JM',
443 'ext': 'm4a',
444 'title': 'Taylor Swift - Shake It Off',
445 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
446 'uploader': 'TaylorSwiftVEVO',
447 'uploader_id': 'TaylorSwiftVEVO',
448 'upload_date': '20140818',
449 },
450 'params': {
451 'youtube_include_dash_manifest': True,
452 'format': '141',
453 },
454 },
455 # Controversy video
456 {
457 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
458 'info_dict': {
459 'id': 'T4XJQO3qol8',
460 'ext': 'mp4',
461 'upload_date': '20100909',
462 'uploader': 'The Amazing Atheist',
463 'uploader_id': 'TheAmazingAtheist',
464 'title': 'Burning Everyone\'s Koran',
465 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
466 }
467 },
468 # Normal age-gate video (No vevo, embed allowed)
469 {
470 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
471 'info_dict': {
472 'id': 'HtVdAasjOgU',
473 'ext': 'mp4',
474 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
475 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
476 'uploader': 'The Witcher',
477 'uploader_id': 'WitcherGame',
478 'upload_date': '20140605',
479 'age_limit': 18,
480 },
481 },
482 # Age-gate video with encrypted signature
483 {
484 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
485 'info_dict': {
486 'id': '6kLq3WMV1nU',
487 'ext': 'mp4',
488 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
489 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
490 'uploader': 'LloydVEVO',
491 'uploader_id': 'LloydVEVO',
492 'upload_date': '20110629',
493 'age_limit': 18,
494 },
495 },
496 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
497 {
498 'url': '__2ABJjxzNo',
499 'info_dict': {
500 'id': '__2ABJjxzNo',
501 'ext': 'mp4',
502 'upload_date': '20100430',
503 'uploader_id': 'deadmau5',
504 'description': 'md5:12c56784b8032162bb936a5f76d55360',
505 'uploader': 'deadmau5',
506 'title': 'Deadmau5 - Some Chords (HD)',
507 },
508 'expected_warnings': [
509 'DASH manifest missing',
510 ]
511 },
512 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
513 {
514 'url': 'lqQg6PlCWgI',
515 'info_dict': {
516 'id': 'lqQg6PlCWgI',
517 'ext': 'mp4',
518 'upload_date': '20120731',
519 'uploader_id': 'olympic',
520 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
521 'uploader': 'Olympics',
522 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
523 },
524 'params': {
525 'skip_download': 'requires avconv',
526 }
527 },
528 # Non-square pixels
529 {
530 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
531 'info_dict': {
532 'id': '_b-2C3KPAM0',
533 'ext': 'mp4',
534 'stretched_ratio': 16 / 9.,
535 'upload_date': '20110310',
536 'uploader_id': 'AllenMeow',
537 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
538 'uploader': '孫艾倫',
539 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
540 },
541 },
542 # url_encoded_fmt_stream_map is empty string
543 {
544 'url': 'qEJwOuvDf7I',
545 'info_dict': {
546 'id': 'qEJwOuvDf7I',
547 'ext': 'mp4',
548 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
549 'description': '',
550 'upload_date': '20150404',
551 'uploader_id': 'spbelect',
552 'uploader': 'Наблюдатели Петербурга',
553 },
554 'params': {
555 'skip_download': 'requires avconv',
556 }
557 },
558 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
559 {
560 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
561 'info_dict': {
562 'id': 'FIl7x6_3R5Y',
563 'ext': 'mp4',
564 'title': 'md5:7b81415841e02ecd4313668cde88737a',
565 'description': 'md5:116377fd2963b81ec4ce64b542173306',
566 'upload_date': '20150625',
567 'uploader_id': 'dorappi2000',
568 'uploader': 'dorappi2000',
569 'formats': 'mincount:33',
570 },
571 },
572 # DASH manifest with segment_list
573 {
574 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
575 'md5': '8ce563a1d667b599d21064e982ab9e31',
576 'info_dict': {
577 'id': 'CsmdDsKjzN8',
578 'ext': 'mp4',
579 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
580 'uploader': 'Airtek',
581 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
582 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
583 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
584 },
585 'params': {
586 'youtube_include_dash_manifest': True,
587 'format': '135', # bestvideo
588 }
589 },
590 {
591 # Multifeed videos (multiple cameras), URL is for Main Camera
592 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
593 'info_dict': {
594 'id': 'jqWvoWXjCVs',
595 'title': 'teamPGP: Rocket League Noob Stream',
596 'description': 'md5:dc7872fb300e143831327f1bae3af010',
597 },
598 'playlist': [{
599 'info_dict': {
600 'id': 'jqWvoWXjCVs',
601 'ext': 'mp4',
602 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
603 'description': 'md5:dc7872fb300e143831327f1bae3af010',
604 'upload_date': '20150721',
605 'uploader': 'Beer Games Beer',
606 'uploader_id': 'beergamesbeer',
607 },
608 }, {
609 'info_dict': {
610 'id': '6h8e8xoXJzg',
611 'ext': 'mp4',
612 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
613 'description': 'md5:dc7872fb300e143831327f1bae3af010',
614 'upload_date': '20150721',
615 'uploader': 'Beer Games Beer',
616 'uploader_id': 'beergamesbeer',
617 },
618 }, {
619 'info_dict': {
620 'id': 'PUOgX5z9xZw',
621 'ext': 'mp4',
622 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
623 'description': 'md5:dc7872fb300e143831327f1bae3af010',
624 'upload_date': '20150721',
625 'uploader': 'Beer Games Beer',
626 'uploader_id': 'beergamesbeer',
627 },
628 }, {
629 'info_dict': {
630 'id': 'teuwxikvS5k',
631 'ext': 'mp4',
632 'title': 'teamPGP: Rocket League Noob Stream (zim)',
633 'description': 'md5:dc7872fb300e143831327f1bae3af010',
634 'upload_date': '20150721',
635 'uploader': 'Beer Games Beer',
636 'uploader_id': 'beergamesbeer',
637 },
638 }],
639 'params': {
640 'skip_download': True,
641 },
642 }
643 ]
644
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of extracted signature-decryption functions, keyed by
        # (player_url, signature cache id); filled in _decrypt_signature.
        self._player_cache = {}
648
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)
652
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)
656
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))
660
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
664
665 def _signature_cache_id(self, example_sig):
666 """ Return a string representation of a signature """
667 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
668
    def _extract_signature_function(self, video_id, player_url, example_sig):
        # Resolve a signature-decryption function for the given player.
        # Checks the on-disk cache first; otherwise downloads the player
        # (JS or SWF), extracts the function, and caches its permutation.
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache file name, so it must not contain
        # path separators.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # cache_spec is a list of source indices: the decryption is a
            # pure character permutation/selection of the input signature.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the function on a string of distinct characters to record,
        # for each output position, which input index it came from.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
714
    def _print_sig_code(self, func, example_sig):
        # Debug helper (--youtube-print-sig-code): print the signature
        # function as equivalent Python slice expressions.
        def gen_sig_code(idxs):
            # Compress a list of source indices into s[i] terms and
            # s[start:end:step] slices for consecutive runs (step +/-1).
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or flush the slice and reset.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new ascending/descending run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the run still in progress.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct characters to recover the index permutation.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
753
754 def _parse_sig_js(self, jscode):
755 funcname = self._search_regex(
756 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
757 'Initial JS player signature function name')
758
759 jsi = JSInterpreter(jscode)
760 initial_function = jsi.extract_function(funcname)
761 return lambda s: initial_function([s])
762
763 def _parse_sig_swf(self, file_contents):
764 swfi = SWFInterpreter(file_contents)
765 TARGET_CLASSNAME = 'SignatureDecipher'
766 searched_class = swfi.extract_class(TARGET_CLASSNAME)
767 initial_function = swfi.extract_function(searched_class, 'decipher')
768 return lambda s: initial_function([s])
769
770 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
771 """Turn the encrypted s field into a working signature"""
772
773 if player_url is None:
774 raise ExtractorError('Cannot decrypt signature without player_url')
775
776 if player_url.startswith('//'):
777 player_url = 'https:' + player_url
778 try:
779 player_id = (player_url, self._signature_cache_id(s))
780 if player_id not in self._player_cache:
781 func = self._extract_signature_function(
782 video_id, player_url, s
783 )
784 self._player_cache[player_id] = func
785 func = self._player_cache[player_id]
786 if self._downloader.params.get('youtube_print_sig_code'):
787 self._print_sig_code(func, s)
788 return func(s)
789 except Exception as e:
790 tb = traceback.format_exc()
791 raise ExtractorError(
792 'Signature extraction failed: ' + tb, cause=e)
793
794 def _get_subtitles(self, video_id, webpage):
795 try:
796 subs_doc = self._download_xml(
797 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
798 video_id, note=False)
799 except ExtractorError as err:
800 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
801 return {}
802
803 sub_lang_list = {}
804 for track in subs_doc.findall('track'):
805 lang = track.attrib['lang_code']
806 if lang in sub_lang_list:
807 continue
808 sub_formats = []
809 for ext in ['sbv', 'vtt', 'srt']:
810 params = compat_urllib_parse.urlencode({
811 'lang': lang,
812 'v': video_id,
813 'fmt': ext,
814 'name': track.attrib['name'].encode('utf-8'),
815 })
816 sub_formats.append({
817 'url': 'https://www.youtube.com/api/timedtext?' + params,
818 'ext': ext,
819 })
820 sub_lang_list[lang] = sub_formats
821 if not sub_lang_list:
822 self._downloader.report_warning('video doesn\'t have subtitles')
823 return {}
824 return sub_lang_list
825
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # The caption base URL (ttsurl) only appears inside the embedded
        # ytplayer config blob on the watch page.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> is the ASR track in the video's own language;
            # without it there is nothing to translate from.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                # Each <target> is a language the original track can be
                # translated into; build one URL per (language, format) pair.
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in ['sbv', 'vtt', 'srt']:
                    params = compat_urllib_parse.urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
878
879 @classmethod
880 def extract_id(cls, url):
881 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
882 if mobj is None:
883 raise ExtractorError('Invalid URL: %s' % url)
884 video_id = mobj.group(2)
885 return video_id
886
887 def _extract_from_m3u8(self, manifest_url, video_id):
888 url_map = {}
889
890 def _get_urls(_manifest):
891 lines = _manifest.split('\n')
892 urls = filter(lambda l: l and not l.startswith('#'),
893 lines)
894 return urls
895 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
896 formats_urls = _get_urls(manifest)
897 for format_url in formats_urls:
898 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
899 url_map[itag] = format_url
900 return url_map
901
902 def _extract_annotations(self, video_id):
903 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
904 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
905
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
        """Download a DASH MPD manifest and return a list of format dicts.

        Encrypted signatures embedded in the manifest URL path (/s/<sig>) are
        deciphered with the given player before the download.  Returns [] when
        the manifest cannot be fetched and *fatal* is False.
        """
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest',
            fatal=fatal)

        # _download_xml returns False (not None) on non-fatal failure
        if dash_doc is False:
            return []

        formats = []
        for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    # contentLength lives in a YouTube-specific XML namespace
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        f.update({
                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                            'protocol': 'http_dash_segments',
                        })
                    try:
                        # The same itag can appear again; merge into the
                        # format collected earlier instead of duplicating it.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        full_info = self._formats.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            # The static itag table declares one codec as
                            # 'none'; the manifest's codecs string fills in
                            # the other side.
                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                                full_info['vcodec'] = codecs
                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                                full_info['acodec'] = codecs
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
972
973 def _real_extract(self, url):
974 url, smuggled_data = unsmuggle_url(url, {})
975
976 proto = (
977 'http' if self._downloader.params.get('prefer_insecure', False)
978 else 'https')
979
980 start_time = None
981 end_time = None
982 parsed_url = compat_urllib_parse_urlparse(url)
983 for component in [parsed_url.fragment, parsed_url.query]:
984 query = compat_parse_qs(component)
985 if start_time is None and 't' in query:
986 start_time = parse_duration(query['t'][0])
987 if start_time is None and 'start' in query:
988 start_time = parse_duration(query['start'][0])
989 if end_time is None and 'end' in query:
990 end_time = parse_duration(query['end'][0])
991
992 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
993 mobj = re.search(self._NEXT_URL_RE, url)
994 if mobj:
995 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
996 video_id = self.extract_id(url)
997
998 # Get video webpage
999 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1000 video_webpage = self._download_webpage(url, video_id)
1001
1002 # Attempt to extract SWF player URL
1003 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1004 if mobj is not None:
1005 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1006 else:
1007 player_url = None
1008
1009 dash_mpds = []
1010
1011 def add_dash_mpd(video_info):
1012 dash_mpd = video_info.get('dashmpd')
1013 if dash_mpd and dash_mpd[0] not in dash_mpds:
1014 dash_mpds.append(dash_mpd[0])
1015
1016 # Get video info
1017 embed_webpage = None
1018 is_live = None
1019 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1020 age_gate = True
1021 # We simulate the access to the video from www.youtube.com/v/{video_id}
1022 # this can be viewed without login into Youtube
1023 url = proto + '://www.youtube.com/embed/%s' % video_id
1024 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1025 data = compat_urllib_parse.urlencode({
1026 'video_id': video_id,
1027 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1028 'sts': self._search_regex(
1029 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1030 })
1031 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1032 video_info_webpage = self._download_webpage(
1033 video_info_url, video_id,
1034 note='Refetching age-gated info webpage',
1035 errnote='unable to download video info webpage')
1036 video_info = compat_parse_qs(video_info_webpage)
1037 add_dash_mpd(video_info)
1038 else:
1039 age_gate = False
1040 video_info = None
1041 # Try looking directly into the video webpage
1042 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1043 if mobj:
1044 json_code = uppercase_escape(mobj.group(1))
1045 ytplayer_config = json.loads(json_code)
1046 args = ytplayer_config['args']
1047 if args.get('url_encoded_fmt_stream_map'):
1048 # Convert to the same format returned by compat_parse_qs
1049 video_info = dict((k, [v]) for k, v in args.items())
1050 add_dash_mpd(video_info)
1051 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1052 is_live = True
1053 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1054 # We also try looking in get_video_info since it may contain different dashmpd
1055 # URL that points to a DASH manifest with possibly different itag set (some itags
1056 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1057 # manifest pointed by get_video_info's dashmpd).
1058 # The general idea is to take a union of itags of both DASH manifests (for example
1059 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1060 self.report_video_info_webpage_download(video_id)
1061 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1062 video_info_url = (
1063 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1064 % (proto, video_id, el_type))
1065 video_info_webpage = self._download_webpage(
1066 video_info_url,
1067 video_id, note=False,
1068 errnote='unable to download video info webpage')
1069 get_video_info = compat_parse_qs(video_info_webpage)
1070 if get_video_info.get('use_cipher_signature') != ['True']:
1071 add_dash_mpd(get_video_info)
1072 if not video_info:
1073 video_info = get_video_info
1074 if 'token' in get_video_info:
1075 break
1076 if 'token' not in video_info:
1077 if 'reason' in video_info:
1078 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1079 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1080 if regions_allowed:
1081 raise ExtractorError('YouTube said: This video is available in %s only' % (
1082 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1083 expected=True)
1084 raise ExtractorError(
1085 'YouTube said: %s' % video_info['reason'][0],
1086 expected=True, video_id=video_id)
1087 else:
1088 raise ExtractorError(
1089 '"token" parameter not in video info for unknown reason',
1090 video_id=video_id)
1091
1092 # title
1093 if 'title' in video_info:
1094 video_title = video_info['title'][0]
1095 else:
1096 self._downloader.report_warning('Unable to extract video title')
1097 video_title = '_'
1098
1099 # description
1100 video_description = get_element_by_id("eow-description", video_webpage)
1101 if video_description:
1102 video_description = re.sub(r'''(?x)
1103 <a\s+
1104 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1105 title="([^"]+)"\s+
1106 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1107 class="yt-uix-redirect-link"\s*>
1108 [^<]+
1109 </a>
1110 ''', r'\1', video_description)
1111 video_description = clean_html(video_description)
1112 else:
1113 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1114 if fd_mobj:
1115 video_description = unescapeHTML(fd_mobj.group(1))
1116 else:
1117 video_description = ''
1118
1119 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1120 if not self._downloader.params.get('noplaylist'):
1121 entries = []
1122 feed_ids = []
1123 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1124 for feed in multifeed_metadata_list.split(','):
1125 feed_data = compat_parse_qs(feed)
1126 entries.append({
1127 '_type': 'url_transparent',
1128 'ie_key': 'Youtube',
1129 'url': smuggle_url(
1130 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1131 {'force_singlefeed': True}),
1132 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1133 })
1134 feed_ids.append(feed_data['id'][0])
1135 self.to_screen(
1136 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1137 % (', '.join(feed_ids), video_id))
1138 return self.playlist_result(entries, video_id, video_title, video_description)
1139 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1140
1141 if 'view_count' in video_info:
1142 view_count = int(video_info['view_count'][0])
1143 else:
1144 view_count = None
1145
1146 # Check for "rental" videos
1147 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1148 raise ExtractorError('"rental" videos not supported')
1149
1150 # Start extracting information
1151 self.report_information_extraction(video_id)
1152
1153 # uploader
1154 if 'author' not in video_info:
1155 raise ExtractorError('Unable to extract uploader name')
1156 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1157
1158 # uploader_id
1159 video_uploader_id = None
1160 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1161 if mobj is not None:
1162 video_uploader_id = mobj.group(1)
1163 else:
1164 self._downloader.report_warning('unable to extract uploader nickname')
1165
1166 # thumbnail image
1167 # We try first to get a high quality image:
1168 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1169 video_webpage, re.DOTALL)
1170 if m_thumb is not None:
1171 video_thumbnail = m_thumb.group(1)
1172 elif 'thumbnail_url' not in video_info:
1173 self._downloader.report_warning('unable to extract video thumbnail')
1174 video_thumbnail = None
1175 else: # don't panic if we can't find it
1176 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1177
1178 # upload date
1179 upload_date = self._html_search_meta(
1180 'datePublished', video_webpage, 'upload date', default=None)
1181 if not upload_date:
1182 upload_date = self._search_regex(
1183 [r'(?s)id="eow-date.*?>(.*?)</span>',
1184 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1185 video_webpage, 'upload date', default=None)
1186 if upload_date:
1187 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1188 upload_date = unified_strdate(upload_date)
1189
1190 m_cat_container = self._search_regex(
1191 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1192 video_webpage, 'categories', default=None)
1193 if m_cat_container:
1194 category = self._html_search_regex(
1195 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1196 default=None)
1197 video_categories = None if category is None else [category]
1198 else:
1199 video_categories = None
1200
1201 video_tags = [
1202 unescapeHTML(m.group('content'))
1203 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1204
1205 def _extract_count(count_name):
1206 return str_to_int(self._search_regex(
1207 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1208 % re.escape(count_name),
1209 video_webpage, count_name, default=None))
1210
1211 like_count = _extract_count('like')
1212 dislike_count = _extract_count('dislike')
1213
1214 # subtitles
1215 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1216 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1217
1218 if 'length_seconds' not in video_info:
1219 self._downloader.report_warning('unable to extract video duration')
1220 video_duration = None
1221 else:
1222 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1223
1224 # annotations
1225 video_annotations = None
1226 if self._downloader.params.get('writeannotations', False):
1227 video_annotations = self._extract_annotations(video_id)
1228
1229 def _map_to_format_list(urlmap):
1230 formats = []
1231 for itag, video_real_url in urlmap.items():
1232 dct = {
1233 'format_id': itag,
1234 'url': video_real_url,
1235 'player_url': player_url,
1236 }
1237 if itag in self._formats:
1238 dct.update(self._formats[itag])
1239 formats.append(dct)
1240 return formats
1241
1242 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1243 self.report_rtmp_download()
1244 formats = [{
1245 'format_id': '_rtmp',
1246 'protocol': 'rtmp',
1247 'url': video_info['conn'][0],
1248 'player_url': player_url,
1249 }]
1250 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1251 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1252 if 'rtmpe%3Dyes' in encoded_url_map:
1253 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1254 url_map = {}
1255 for url_data_str in encoded_url_map.split(','):
1256 url_data = compat_parse_qs(url_data_str)
1257 if 'itag' not in url_data or 'url' not in url_data:
1258 continue
1259 format_id = url_data['itag'][0]
1260 url = url_data['url'][0]
1261
1262 if 'sig' in url_data:
1263 url += '&signature=' + url_data['sig'][0]
1264 elif 's' in url_data:
1265 encrypted_sig = url_data['s'][0]
1266 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1267
1268 jsplayer_url_json = self._search_regex(
1269 ASSETS_RE,
1270 embed_webpage if age_gate else video_webpage,
1271 'JS player URL (1)', default=None)
1272 if not jsplayer_url_json and not age_gate:
1273 # We need the embed website after all
1274 if embed_webpage is None:
1275 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1276 embed_webpage = self._download_webpage(
1277 embed_url, video_id, 'Downloading embed webpage')
1278 jsplayer_url_json = self._search_regex(
1279 ASSETS_RE, embed_webpage, 'JS player URL')
1280
1281 player_url = json.loads(jsplayer_url_json)
1282 if player_url is None:
1283 player_url_json = self._search_regex(
1284 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1285 video_webpage, 'age gate player URL')
1286 player_url = json.loads(player_url_json)
1287
1288 if self._downloader.params.get('verbose'):
1289 if player_url is None:
1290 player_version = 'unknown'
1291 player_desc = 'unknown'
1292 else:
1293 if player_url.endswith('swf'):
1294 player_version = self._search_regex(
1295 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1296 'flash player', fatal=False)
1297 player_desc = 'flash player %s' % player_version
1298 else:
1299 player_version = self._search_regex(
1300 r'html5player-([^/]+?)(?:/html5player)?\.js',
1301 player_url,
1302 'html5 player', fatal=False)
1303 player_desc = 'html5 player %s' % player_version
1304
1305 parts_sizes = self._signature_cache_id(encrypted_sig)
1306 self.to_screen('{%s} signature length %s, %s' %
1307 (format_id, parts_sizes, player_desc))
1308
1309 signature = self._decrypt_signature(
1310 encrypted_sig, video_id, player_url, age_gate)
1311 url += '&signature=' + signature
1312 if 'ratebypass' not in url:
1313 url += '&ratebypass=yes'
1314 url_map[format_id] = url
1315 formats = _map_to_format_list(url_map)
1316 elif video_info.get('hlsvp'):
1317 manifest_url = video_info['hlsvp'][0]
1318 url_map = self._extract_from_m3u8(manifest_url, video_id)
1319 formats = _map_to_format_list(url_map)
1320 else:
1321 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1322
1323 # Look for the DASH manifest
1324 if self._downloader.params.get('youtube_include_dash_manifest', True):
1325 dash_mpd_fatal = True
1326 for dash_manifest_url in dash_mpds:
1327 dash_formats = {}
1328 try:
1329 for df in self._parse_dash_manifest(
1330 video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1331 # Do not overwrite DASH format found in some previous DASH manifest
1332 if df['format_id'] not in dash_formats:
1333 dash_formats[df['format_id']] = df
1334 # Additional DASH manifests may end up in HTTP Error 403 therefore
1335 # allow them to fail without bug report message if we already have
1336 # some DASH manifest succeeded. This is temporary workaround to reduce
1337 # burst of bug reports until we figure out the reason and whether it
1338 # can be fixed at all.
1339 dash_mpd_fatal = False
1340 except (ExtractorError, KeyError) as e:
1341 self.report_warning(
1342 'Skipping DASH manifest: %r' % e, video_id)
1343 if dash_formats:
1344 # Remove the formats we found through non-DASH, they
1345 # contain less info and it can be wrong, because we use
1346 # fixed values (for example the resolution). See
1347 # https://github.com/rg3/youtube-dl/issues/5774 for an
1348 # example.
1349 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1350 formats.extend(dash_formats.values())
1351
1352 # Check for malformed aspect ratio
1353 stretched_m = re.search(
1354 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1355 video_webpage)
1356 if stretched_m:
1357 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1358 for f in formats:
1359 if f.get('vcodec') != 'none':
1360 f['stretched_ratio'] = ratio
1361
1362 self._sort_formats(formats)
1363
1364 return {
1365 'id': video_id,
1366 'uploader': video_uploader,
1367 'uploader_id': video_uploader_id,
1368 'upload_date': upload_date,
1369 'title': video_title,
1370 'thumbnail': video_thumbnail,
1371 'description': video_description,
1372 'categories': video_categories,
1373 'tags': video_tags,
1374 'subtitles': video_subtitles,
1375 'automatic_captions': automatic_captions,
1376 'duration': video_duration,
1377 'age_limit': 18 if age_gate else 0,
1378 'annotations': video_annotations,
1379 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1380 'view_count': view_count,
1381 'like_count': like_count,
1382 'dislike_count': dislike_count,
1383 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1384 'formats': formats,
1385 'is_live': is_live,
1386 'start_time': start_time,
1387 'end_time': end_time,
1388 }
1389
1390
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # BUGFIX: key was misspelled 'playlist_mincout', so the minimum-count
        # assertion was silently skipped by the test harness.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist (id = 'RD' + video id)."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page by page via the Load-more AJAX."""
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1583
1584
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs found in a channel page.

        Duplicate ids are collapsed; a later occurrence may supply the title
        when the first match had none.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                # Already seen: keep only the first entry, but backfill a
                # missing title if this match has one.
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = self._html_search_meta(
            'channelId', channel_page, 'channel id', default=None)
        if not channel_playlist_id:
            channel_playlist_id = self._search_regex(
                r'data-channel-external-id="([^"]+)"',
                channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The uploads playlist of channel UCxxx is named UUxxx
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the channel's "Load more" pagination.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1677
1678
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL is deliberately permissive, so step aside whenever any
        # other youtube extractor in this module also matches the URL.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
1705
1706
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML search results, collecting at most `n` video
        entries, and returns them as a playlist titled after the query.
        Raises ExtractorError when YouTube reports no results.
        """
        videos = []
        limit = n

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop once the page yields nothing new or we already have enough
            # results; '>=' avoids downloading one redundant page when exactly
            # `limit` videos have been collected.
            if not new_videos or len(videos) >= limit:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
1750
1751
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Identical to YoutubeSearchIE except results are ordered by upload date,
    # requested via the extra 'search_sort' query argument.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
1757
1758
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Turn one search-results page into a playlist of its videos."""
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result sits inside its own lockup-title header.
        for item_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
            title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', url_snippet),
                'title': title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1800
1801
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Extract a show as a playlist of its per-season playlists."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = [
            self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist')
            for season_path in season_paths
        ]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1836
1837
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect all video ids from the authenticated feed as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            # NOTE: must be a list, not filter(): on Python 3 filter() returns
            # a lazy iterator that is always truthy, so `if not new_ids` would
            # never break and ids.extend() would silently exhaust it.
            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1885
1886
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        # 'WL' is the fixed id of the authenticated user's Watch Later playlist
        return self._extract_playlist('WL')
1896
1897
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page is backed by a regular playlist; find its id
        # on the page and delegate to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')
1908
1909
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/recommended
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1915
1916
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/subscriptions
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1922
1923
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/history
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, consistent with the sibling feed extractors; in a plain
    # string literal '\.' is an invalid escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1929
1930
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their video id and fail with an
    explanatory message (the usual cause is an unquoted '&' in the shell)."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always raises: there is nothing to extract from a truncated URL.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
1974
1975
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Video ids are 11 characters; anything shorter is a truncated URL,
        # so always fail with a descriptive message.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)