]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[canalplus] Extend video id regex (Closes #7076)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..compat import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
24 compat_urlparse,
25 compat_str,
26 )
27 from ..utils import (
28 clean_html,
29 encode_dict,
30 ExtractorError,
31 float_or_none,
32 get_element_by_attribute,
33 get_element_by_id,
34 int_or_none,
35 orderedSet,
36 parse_duration,
37 remove_start,
38 smuggle_url,
39 str_to_int,
40 unescapeHTML,
41 unified_strdate,
42 unsmuggle_url,
43 uppercase_escape,
44 ISO3166Utils,
45 )
46
47
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Pin the interface language to English via the PREF cookie so that
        # text scraped from pages (labels, dates) is predictable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap each bare video id in a url_result dict to be handled by the
        # 'Youtube' extractor.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        # NOTE(review): with fatal=False a failed download yields False, and
        # this returns None (falsy) rather than False — callers only test
        # truthiness, so the distinction is harmless.
        if login_page is False:
            return

        # The hidden GALX form token must be echoed back in the login POST.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        # This error element appears when an application-specific password
        # was supplied instead of the account password.
        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Codes are sometimes entered with the "G-" prefix shown by Google.
            tfa_code = remove_start(tfa_code, 'G-')

            # Re-submit all hidden fields of the challenge form plus the code.
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # If the challenge form is still present the code was rejected.
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            # Being bounced back to the login form means the flow broke.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Landing back on the login form (without a 2FA challenge) indicates
        # bad credentials.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before extraction; set language cookie and try to log in.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
179
180
181 class YoutubeIE(YoutubeBaseInfoExtractor):
182 IE_DESC = 'YouTube.com'
183 _VALID_URL = r"""(?x)^
184 (
185 (?:https?://|//) # http(s):// or protocol-independent URL
186 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
187 (?:www\.)?deturl\.com/www\.youtube\.com/|
188 (?:www\.)?pwnyoutube\.com/|
189 (?:www\.)?yourepeat\.com/|
190 tube\.majestyc\.net/|
191 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
192 (?:.*?\#/)? # handle anchor (#/) redirect urls
193 (?: # the various things that can precede the ID:
194 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
195 |(?: # or the v= param in all its forms
196 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
197 (?:\?|\#!?) # the params delimiter ? or # or #!
198 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
199 v=
200 )
201 ))
202 |(?:
203 youtu\.be| # just youtu.be/xxxx
204 vid\.plus # or vid.plus/xxxx
205 )/
206 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
207 )
208 )? # all until now is optional -> you can pass the naked ID
209 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
210 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
211 (?(1).+)? # if we found the ID, everything can follow
212 $"""
213 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
214 _formats = {
215 '5': {'ext': 'flv', 'width': 400, 'height': 240},
216 '6': {'ext': 'flv', 'width': 450, 'height': 270},
217 '13': {'ext': '3gp'},
218 '17': {'ext': '3gp', 'width': 176, 'height': 144},
219 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
220 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
221 '34': {'ext': 'flv', 'width': 640, 'height': 360},
222 '35': {'ext': 'flv', 'width': 854, 'height': 480},
223 '36': {'ext': '3gp', 'width': 320, 'height': 240},
224 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
225 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
226 '43': {'ext': 'webm', 'width': 640, 'height': 360},
227 '44': {'ext': 'webm', 'width': 854, 'height': 480},
228 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
229 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
230 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
231 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
232
233
234 # 3d videos
235 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
236 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
237 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
238 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
239 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
240 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
241 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
242
243 # Apple HTTP Live Streaming
244 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
245 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
246 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
247 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
248 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
249 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
250 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
251
252 # DASH mp4 video
253 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
259 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
264
265 # Dash mp4 audio
266 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
267 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
268 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
269
270 # Dash webm
271 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
272 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
273 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
274 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
275 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
276 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
277 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
278 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
287 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
288 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
289 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
291 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
292
293 # Dash webm audio
294 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
295 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
296
297 # Dash webm audio with opus inside
298 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
299 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
300 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
301
302 # RTMP (unnamed)
303 '_rtmp': {'protocol': 'rtmp'},
304 }
305
306 IE_NAME = 'youtube'
307 _TESTS = [
308 {
309 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
310 'info_dict': {
311 'id': 'BaW_jenozKc',
312 'ext': 'mp4',
313 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
314 'uploader': 'Philipp Hagemeister',
315 'uploader_id': 'phihag',
316 'upload_date': '20121002',
317 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
318 'categories': ['Science & Technology'],
319 'tags': ['youtube-dl'],
320 'like_count': int,
321 'dislike_count': int,
322 'start_time': 1,
323 'end_time': 9,
324 }
325 },
326 {
327 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
328 'note': 'Test generic use_cipher_signature video (#897)',
329 'info_dict': {
330 'id': 'UxxajLWwzqY',
331 'ext': 'mp4',
332 'upload_date': '20120506',
333 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
334 'description': 'md5:782e8651347686cba06e58f71ab51773',
335 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
336 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
337 'iconic ep', 'iconic', 'love', 'it'],
338 'uploader': 'Icona Pop',
339 'uploader_id': 'IconaPop',
340 }
341 },
342 {
343 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
344 'note': 'Test VEVO video with age protection (#956)',
345 'info_dict': {
346 'id': '07FYdnEawAQ',
347 'ext': 'mp4',
348 'upload_date': '20130703',
349 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
350 'description': 'md5:64249768eec3bc4276236606ea996373',
351 'uploader': 'justintimberlakeVEVO',
352 'uploader_id': 'justintimberlakeVEVO',
353 'age_limit': 18,
354 }
355 },
356 {
357 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
358 'note': 'Embed-only video (#1746)',
359 'info_dict': {
360 'id': 'yZIXLfi8CZQ',
361 'ext': 'mp4',
362 'upload_date': '20120608',
363 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
364 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
365 'uploader': 'SET India',
366 'uploader_id': 'setindia'
367 }
368 },
369 {
370 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
371 'note': 'Use the first video ID in the URL',
372 'info_dict': {
373 'id': 'BaW_jenozKc',
374 'ext': 'mp4',
375 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
376 'uploader': 'Philipp Hagemeister',
377 'uploader_id': 'phihag',
378 'upload_date': '20121002',
379 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
380 'categories': ['Science & Technology'],
381 'tags': ['youtube-dl'],
382 'like_count': int,
383 'dislike_count': int,
384 },
385 'params': {
386 'skip_download': True,
387 },
388 },
389 {
390 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
391 'note': '256k DASH audio (format 141) via DASH manifest',
392 'info_dict': {
393 'id': 'a9LDPn-MO4I',
394 'ext': 'm4a',
395 'upload_date': '20121002',
396 'uploader_id': '8KVIDEO',
397 'description': '',
398 'uploader': '8KVIDEO',
399 'title': 'UHDTV TEST 8K VIDEO.mp4'
400 },
401 'params': {
402 'youtube_include_dash_manifest': True,
403 'format': '141',
404 },
405 },
406 # DASH manifest with encrypted signature
407 {
408 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
409 'info_dict': {
410 'id': 'IB3lcPjvWLA',
411 'ext': 'm4a',
412 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
413 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
414 'uploader': 'AfrojackVEVO',
415 'uploader_id': 'AfrojackVEVO',
416 'upload_date': '20131011',
417 },
418 'params': {
419 'youtube_include_dash_manifest': True,
420 'format': '141',
421 },
422 },
423 # JS player signature function name containing $
424 {
425 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
426 'info_dict': {
427 'id': 'nfWlot6h_JM',
428 'ext': 'm4a',
429 'title': 'Taylor Swift - Shake It Off',
430 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
431 'uploader': 'TaylorSwiftVEVO',
432 'uploader_id': 'TaylorSwiftVEVO',
433 'upload_date': '20140818',
434 },
435 'params': {
436 'youtube_include_dash_manifest': True,
437 'format': '141',
438 },
439 },
440 # Controversy video
441 {
442 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
443 'info_dict': {
444 'id': 'T4XJQO3qol8',
445 'ext': 'mp4',
446 'upload_date': '20100909',
447 'uploader': 'The Amazing Atheist',
448 'uploader_id': 'TheAmazingAtheist',
449 'title': 'Burning Everyone\'s Koran',
450 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
451 }
452 },
453 # Normal age-gate video (No vevo, embed allowed)
454 {
455 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
456 'info_dict': {
457 'id': 'HtVdAasjOgU',
458 'ext': 'mp4',
459 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
460 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
461 'uploader': 'The Witcher',
462 'uploader_id': 'WitcherGame',
463 'upload_date': '20140605',
464 'age_limit': 18,
465 },
466 },
467 # Age-gate video with encrypted signature
468 {
469 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
470 'info_dict': {
471 'id': '6kLq3WMV1nU',
472 'ext': 'mp4',
473 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
474 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
475 'uploader': 'LloydVEVO',
476 'uploader_id': 'LloydVEVO',
477 'upload_date': '20110629',
478 'age_limit': 18,
479 },
480 },
481 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
482 {
483 'url': '__2ABJjxzNo',
484 'info_dict': {
485 'id': '__2ABJjxzNo',
486 'ext': 'mp4',
487 'upload_date': '20100430',
488 'uploader_id': 'deadmau5',
489 'description': 'md5:12c56784b8032162bb936a5f76d55360',
490 'uploader': 'deadmau5',
491 'title': 'Deadmau5 - Some Chords (HD)',
492 },
493 'expected_warnings': [
494 'DASH manifest missing',
495 ]
496 },
497 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
498 {
499 'url': 'lqQg6PlCWgI',
500 'info_dict': {
501 'id': 'lqQg6PlCWgI',
502 'ext': 'mp4',
503 'upload_date': '20120724',
504 'uploader_id': 'olympic',
505 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
506 'uploader': 'Olympics',
507 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
508 },
509 'params': {
510 'skip_download': 'requires avconv',
511 }
512 },
513 # Non-square pixels
514 {
515 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
516 'info_dict': {
517 'id': '_b-2C3KPAM0',
518 'ext': 'mp4',
519 'stretched_ratio': 16 / 9.,
520 'upload_date': '20110310',
521 'uploader_id': 'AllenMeow',
522 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
523 'uploader': '孫艾倫',
524 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
525 },
526 },
527 # url_encoded_fmt_stream_map is empty string
528 {
529 'url': 'qEJwOuvDf7I',
530 'info_dict': {
531 'id': 'qEJwOuvDf7I',
532 'ext': 'webm',
533 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
534 'description': '',
535 'upload_date': '20150404',
536 'uploader_id': 'spbelect',
537 'uploader': 'Наблюдатели Петербурга',
538 },
539 'params': {
540 'skip_download': 'requires avconv',
541 }
542 },
543 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
544 {
545 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
546 'info_dict': {
547 'id': 'FIl7x6_3R5Y',
548 'ext': 'mp4',
549 'title': 'md5:7b81415841e02ecd4313668cde88737a',
550 'description': 'md5:116377fd2963b81ec4ce64b542173306',
551 'upload_date': '20150625',
552 'uploader_id': 'dorappi2000',
553 'uploader': 'dorappi2000',
554 'formats': 'mincount:33',
555 },
556 },
557 # DASH manifest with segment_list
558 {
559 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
560 'md5': '8ce563a1d667b599d21064e982ab9e31',
561 'info_dict': {
562 'id': 'CsmdDsKjzN8',
563 'ext': 'mp4',
564 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
565 'uploader': 'Airtek',
566 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
567 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
568 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
569 },
570 'params': {
571 'youtube_include_dash_manifest': True,
572 'format': '135', # bestvideo
573 }
574 },
575 {
576 # Multifeed videos (multiple cameras), URL is for Main Camera
577 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
578 'info_dict': {
579 'id': 'jqWvoWXjCVs',
580 'title': 'teamPGP: Rocket League Noob Stream',
581 'description': 'md5:dc7872fb300e143831327f1bae3af010',
582 },
583 'playlist': [{
584 'info_dict': {
585 'id': 'jqWvoWXjCVs',
586 'ext': 'mp4',
587 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
588 'description': 'md5:dc7872fb300e143831327f1bae3af010',
589 'upload_date': '20150721',
590 'uploader': 'Beer Games Beer',
591 'uploader_id': 'beergamesbeer',
592 },
593 }, {
594 'info_dict': {
595 'id': '6h8e8xoXJzg',
596 'ext': 'mp4',
597 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
598 'description': 'md5:dc7872fb300e143831327f1bae3af010',
599 'upload_date': '20150721',
600 'uploader': 'Beer Games Beer',
601 'uploader_id': 'beergamesbeer',
602 },
603 }, {
604 'info_dict': {
605 'id': 'PUOgX5z9xZw',
606 'ext': 'mp4',
607 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
608 'description': 'md5:dc7872fb300e143831327f1bae3af010',
609 'upload_date': '20150721',
610 'uploader': 'Beer Games Beer',
611 'uploader_id': 'beergamesbeer',
612 },
613 }, {
614 'info_dict': {
615 'id': 'teuwxikvS5k',
616 'ext': 'mp4',
617 'title': 'teamPGP: Rocket League Noob Stream (zim)',
618 'description': 'md5:dc7872fb300e143831327f1bae3af010',
619 'upload_date': '20150721',
620 'uploader': 'Beer Games Beer',
621 'uploader_id': 'beergamesbeer',
622 },
623 }],
624 'params': {
625 'skip_download': True,
626 },
627 },
628 {
629 'url': 'http://vid.plus/FlRa-iH7PGw',
630 'only_matching': True,
631 }
632 ]
633
634 def __init__(self, *args, **kwargs):
635 super(YoutubeIE, self).__init__(*args, **kwargs)
636 self._player_cache = {}
637
638 def report_video_info_webpage_download(self, video_id):
639 """Report attempt to download video info webpage."""
640 self.to_screen('%s: Downloading video info webpage' % video_id)
641
642 def report_information_extraction(self, video_id):
643 """Report attempt to extract video information."""
644 self.to_screen('%s: Extracting video information' % video_id)
645
646 def report_unavailable_format(self, video_id, format):
647 """Report extracted video URL."""
648 self.to_screen('%s: Format %s not available' % (video_id, format))
649
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
653
654 def _signature_cache_id(self, example_sig):
655 """ Return a string representation of a signature """
656 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
657
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the filesystem cache) the function that turns a
        scrambled signature into a working one, for the player at player_url."""
        # Parse player type ('js'/'swf') and id out of the player URL.
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        # Key the cache by player and by the length layout of the signature
        # (see _signature_cache_id).
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of source indices: position j of the
            # output is taken from input index cache_spec[j].
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of distinct characters
        # to record which input index each output position came from; that
        # permutation spec is what gets cached. (Assumes the function only
        # permutes/selects characters — TODO confirm this holds for all
        # players.)
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
703
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function
        (shown when the youtube_print_sig_code option is set)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a slice expression, omitting redundant parts:
                # a leading 0, a unit step, or an end that would wrap negative.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, compressing runs with step +/-1
            # into slice expressions and emitting isolated indices as s[i].
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    # Run ended at prev; flush it as a slice.
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element (or the final open run).
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Derive the permutation spec the same way the cache does: probe with
        # distinct characters and read back the source index of each position.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
742
743 def _parse_sig_js(self, jscode):
744 funcname = self._search_regex(
745 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
746 'Initial JS player signature function name')
747
748 jsi = JSInterpreter(jscode)
749 initial_function = jsi.extract_function(funcname)
750 return lambda s: initial_function([s])
751
752 def _parse_sig_swf(self, file_contents):
753 swfi = SWFInterpreter(file_contents)
754 TARGET_CLASSNAME = 'SignatureDecipher'
755 searched_class = swfi.extract_class(TARGET_CLASSNAME)
756 initial_function = swfi.extract_function(searched_class, 'decipher')
757 return lambda s: initial_function([s])
758
759 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
760 """Turn the encrypted s field into a working signature"""
761
762 if player_url is None:
763 raise ExtractorError('Cannot decrypt signature without player_url')
764
765 if player_url.startswith('//'):
766 player_url = 'https:' + player_url
767 try:
768 player_id = (player_url, self._signature_cache_id(s))
769 if player_id not in self._player_cache:
770 func = self._extract_signature_function(
771 video_id, player_url, s
772 )
773 self._player_cache[player_id] = func
774 func = self._player_cache[player_id]
775 if self._downloader.params.get('youtube_print_sig_code'):
776 self._print_sig_code(func, s)
777 return func(s)
778 except Exception as e:
779 tb = traceback.format_exc()
780 raise ExtractorError(
781 'Signature extraction failed: ' + tb, cause=e)
782
783 def _get_subtitles(self, video_id, webpage):
784 try:
785 subs_doc = self._download_xml(
786 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
787 video_id, note=False)
788 except ExtractorError as err:
789 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
790 return {}
791
792 sub_lang_list = {}
793 for track in subs_doc.findall('track'):
794 lang = track.attrib['lang_code']
795 if lang in sub_lang_list:
796 continue
797 sub_formats = []
798 for ext in ['sbv', 'vtt', 'srt']:
799 params = compat_urllib_parse.urlencode({
800 'lang': lang,
801 'v': video_id,
802 'fmt': ext,
803 'name': track.attrib['name'].encode('utf-8'),
804 })
805 sub_formats.append({
806 'url': 'https://www.youtube.com/api/timedtext?' + params,
807 'ext': ext,
808 })
809 sub_lang_list[lang] = sub_formats
810 if not sub_lang_list:
811 self._downloader.report_warning('video doesn\'t have subtitles')
812 return {}
813 return sub_lang_list
814
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        # The caption endpoint (ttsurl) lives in the embedded player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> is the automatic (ASR) caption in the video's
            # original language; without it there is nothing to translate.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            # Each <target> is a language the captions can be translated to
            # server-side via the tlang parameter.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in ['sbv', 'vtt', 'srt']:
                    params = compat_urllib_parse.urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
867
868 @classmethod
869 def extract_id(cls, url):
870 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
871 if mobj is None:
872 raise ExtractorError('Invalid URL: %s' % url)
873 video_id = mobj.group(2)
874 return video_id
875
876 def _extract_from_m3u8(self, manifest_url, video_id):
877 url_map = {}
878
879 def _get_urls(_manifest):
880 lines = _manifest.split('\n')
881 urls = filter(lambda l: l and not l.startswith('#'),
882 lines)
883 return urls
884 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
885 formats_urls = _get_urls(manifest)
886 for format_url in formats_urls:
887 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
888 url_map[itag] = format_url
889 return url_map
890
891 def _extract_annotations(self, video_id):
892 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
893 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
894
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
        """Download and parse a DASH MPD, returning a list of format dicts.

        Encrypted '/s/<sig>' path components in the manifest URL are
        decrypted in place before downloading.  Returns [] when the
        download fails and fatal is False.
        """
        def decrypt_sig(mobj):
            # Replace an encrypted '/s/<sig>' URL component with its
            # decrypted '/signature/<sig>' counterpart
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest',
            fatal=fatal)

        # _download_xml returns False (not None) on non-fatal failure
        if dash_doc is False:
            return []

        formats = []
        for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    # filesize comes from a YouTube-specific XML attribute
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        f.update({
                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                            'protocol': 'http_dash_segments',
                        })
                    # Merge with an earlier entry for the same itag if one
                    # exists; otherwise start from the static metadata table
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        full_info = self._formats.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            # The single 'codecs' value describes whichever
                            # stream kind this representation carries
                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                                full_info['vcodec'] = codecs
                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                                full_info['acodec'] = codecs
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
961
962 def _real_extract(self, url):
963 url, smuggled_data = unsmuggle_url(url, {})
964
965 proto = (
966 'http' if self._downloader.params.get('prefer_insecure', False)
967 else 'https')
968
969 start_time = None
970 end_time = None
971 parsed_url = compat_urllib_parse_urlparse(url)
972 for component in [parsed_url.fragment, parsed_url.query]:
973 query = compat_parse_qs(component)
974 if start_time is None and 't' in query:
975 start_time = parse_duration(query['t'][0])
976 if start_time is None and 'start' in query:
977 start_time = parse_duration(query['start'][0])
978 if end_time is None and 'end' in query:
979 end_time = parse_duration(query['end'][0])
980
981 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
982 mobj = re.search(self._NEXT_URL_RE, url)
983 if mobj:
984 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
985 video_id = self.extract_id(url)
986
987 # Get video webpage
988 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
989 video_webpage = self._download_webpage(url, video_id)
990
991 # Attempt to extract SWF player URL
992 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
993 if mobj is not None:
994 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
995 else:
996 player_url = None
997
998 dash_mpds = []
999
1000 def add_dash_mpd(video_info):
1001 dash_mpd = video_info.get('dashmpd')
1002 if dash_mpd and dash_mpd[0] not in dash_mpds:
1003 dash_mpds.append(dash_mpd[0])
1004
1005 # Get video info
1006 embed_webpage = None
1007 is_live = None
1008 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1009 age_gate = True
1010 # We simulate the access to the video from www.youtube.com/v/{video_id}
1011 # this can be viewed without login into Youtube
1012 url = proto + '://www.youtube.com/embed/%s' % video_id
1013 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1014 data = compat_urllib_parse.urlencode({
1015 'video_id': video_id,
1016 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1017 'sts': self._search_regex(
1018 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1019 })
1020 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1021 video_info_webpage = self._download_webpage(
1022 video_info_url, video_id,
1023 note='Refetching age-gated info webpage',
1024 errnote='unable to download video info webpage')
1025 video_info = compat_parse_qs(video_info_webpage)
1026 add_dash_mpd(video_info)
1027 else:
1028 age_gate = False
1029 video_info = None
1030 # Try looking directly into the video webpage
1031 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1032 if mobj:
1033 json_code = uppercase_escape(mobj.group(1))
1034 ytplayer_config = json.loads(json_code)
1035 args = ytplayer_config['args']
1036 if args.get('url_encoded_fmt_stream_map'):
1037 # Convert to the same format returned by compat_parse_qs
1038 video_info = dict((k, [v]) for k, v in args.items())
1039 add_dash_mpd(video_info)
1040 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1041 is_live = True
1042 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1043 # We also try looking in get_video_info since it may contain different dashmpd
1044 # URL that points to a DASH manifest with possibly different itag set (some itags
1045 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1046 # manifest pointed by get_video_info's dashmpd).
1047 # The general idea is to take a union of itags of both DASH manifests (for example
1048 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1049 self.report_video_info_webpage_download(video_id)
1050 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1051 video_info_url = (
1052 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1053 % (proto, video_id, el_type))
1054 video_info_webpage = self._download_webpage(
1055 video_info_url,
1056 video_id, note=False,
1057 errnote='unable to download video info webpage')
1058 get_video_info = compat_parse_qs(video_info_webpage)
1059 if get_video_info.get('use_cipher_signature') != ['True']:
1060 add_dash_mpd(get_video_info)
1061 if not video_info:
1062 video_info = get_video_info
1063 if 'token' in get_video_info:
1064 break
1065 if 'token' not in video_info:
1066 if 'reason' in video_info:
1067 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1068 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1069 if regions_allowed:
1070 raise ExtractorError('YouTube said: This video is available in %s only' % (
1071 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1072 expected=True)
1073 raise ExtractorError(
1074 'YouTube said: %s' % video_info['reason'][0],
1075 expected=True, video_id=video_id)
1076 else:
1077 raise ExtractorError(
1078 '"token" parameter not in video info for unknown reason',
1079 video_id=video_id)
1080
1081 # title
1082 if 'title' in video_info:
1083 video_title = video_info['title'][0]
1084 else:
1085 self._downloader.report_warning('Unable to extract video title')
1086 video_title = '_'
1087
1088 # description
1089 video_description = get_element_by_id("eow-description", video_webpage)
1090 if video_description:
1091 video_description = re.sub(r'''(?x)
1092 <a\s+
1093 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1094 title="([^"]+)"\s+
1095 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1096 class="yt-uix-redirect-link"\s*>
1097 [^<]+
1098 </a>
1099 ''', r'\1', video_description)
1100 video_description = clean_html(video_description)
1101 else:
1102 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1103 if fd_mobj:
1104 video_description = unescapeHTML(fd_mobj.group(1))
1105 else:
1106 video_description = ''
1107
1108 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1109 if not self._downloader.params.get('noplaylist'):
1110 entries = []
1111 feed_ids = []
1112 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1113 for feed in multifeed_metadata_list.split(','):
1114 feed_data = compat_parse_qs(feed)
1115 entries.append({
1116 '_type': 'url_transparent',
1117 'ie_key': 'Youtube',
1118 'url': smuggle_url(
1119 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1120 {'force_singlefeed': True}),
1121 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1122 })
1123 feed_ids.append(feed_data['id'][0])
1124 self.to_screen(
1125 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1126 % (', '.join(feed_ids), video_id))
1127 return self.playlist_result(entries, video_id, video_title, video_description)
1128 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1129
1130 if 'view_count' in video_info:
1131 view_count = int(video_info['view_count'][0])
1132 else:
1133 view_count = None
1134
1135 # Check for "rental" videos
1136 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1137 raise ExtractorError('"rental" videos not supported')
1138
1139 # Start extracting information
1140 self.report_information_extraction(video_id)
1141
1142 # uploader
1143 if 'author' not in video_info:
1144 raise ExtractorError('Unable to extract uploader name')
1145 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1146
1147 # uploader_id
1148 video_uploader_id = None
1149 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1150 if mobj is not None:
1151 video_uploader_id = mobj.group(1)
1152 else:
1153 self._downloader.report_warning('unable to extract uploader nickname')
1154
1155 # thumbnail image
1156 # We try first to get a high quality image:
1157 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1158 video_webpage, re.DOTALL)
1159 if m_thumb is not None:
1160 video_thumbnail = m_thumb.group(1)
1161 elif 'thumbnail_url' not in video_info:
1162 self._downloader.report_warning('unable to extract video thumbnail')
1163 video_thumbnail = None
1164 else: # don't panic if we can't find it
1165 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1166
1167 # upload date
1168 upload_date = self._html_search_meta(
1169 'datePublished', video_webpage, 'upload date', default=None)
1170 if not upload_date:
1171 upload_date = self._search_regex(
1172 [r'(?s)id="eow-date.*?>(.*?)</span>',
1173 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1174 video_webpage, 'upload date', default=None)
1175 if upload_date:
1176 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1177 upload_date = unified_strdate(upload_date)
1178
1179 m_cat_container = self._search_regex(
1180 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1181 video_webpage, 'categories', default=None)
1182 if m_cat_container:
1183 category = self._html_search_regex(
1184 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1185 default=None)
1186 video_categories = None if category is None else [category]
1187 else:
1188 video_categories = None
1189
1190 video_tags = [
1191 unescapeHTML(m.group('content'))
1192 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1193
1194 def _extract_count(count_name):
1195 return str_to_int(self._search_regex(
1196 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1197 % re.escape(count_name),
1198 video_webpage, count_name, default=None))
1199
1200 like_count = _extract_count('like')
1201 dislike_count = _extract_count('dislike')
1202
1203 # subtitles
1204 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1205 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1206
1207 if 'length_seconds' not in video_info:
1208 self._downloader.report_warning('unable to extract video duration')
1209 video_duration = None
1210 else:
1211 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1212
1213 # annotations
1214 video_annotations = None
1215 if self._downloader.params.get('writeannotations', False):
1216 video_annotations = self._extract_annotations(video_id)
1217
1218 def _map_to_format_list(urlmap):
1219 formats = []
1220 for itag, video_real_url in urlmap.items():
1221 dct = {
1222 'format_id': itag,
1223 'url': video_real_url,
1224 'player_url': player_url,
1225 }
1226 if itag in self._formats:
1227 dct.update(self._formats[itag])
1228 formats.append(dct)
1229 return formats
1230
1231 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1232 self.report_rtmp_download()
1233 formats = [{
1234 'format_id': '_rtmp',
1235 'protocol': 'rtmp',
1236 'url': video_info['conn'][0],
1237 'player_url': player_url,
1238 }]
1239 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1240 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1241 if 'rtmpe%3Dyes' in encoded_url_map:
1242 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1243 formats = []
1244 for url_data_str in encoded_url_map.split(','):
1245 url_data = compat_parse_qs(url_data_str)
1246 if 'itag' not in url_data or 'url' not in url_data:
1247 continue
1248 format_id = url_data['itag'][0]
1249 url = url_data['url'][0]
1250
1251 if 'sig' in url_data:
1252 url += '&signature=' + url_data['sig'][0]
1253 elif 's' in url_data:
1254 encrypted_sig = url_data['s'][0]
1255 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1256
1257 jsplayer_url_json = self._search_regex(
1258 ASSETS_RE,
1259 embed_webpage if age_gate else video_webpage,
1260 'JS player URL (1)', default=None)
1261 if not jsplayer_url_json and not age_gate:
1262 # We need the embed website after all
1263 if embed_webpage is None:
1264 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1265 embed_webpage = self._download_webpage(
1266 embed_url, video_id, 'Downloading embed webpage')
1267 jsplayer_url_json = self._search_regex(
1268 ASSETS_RE, embed_webpage, 'JS player URL')
1269
1270 player_url = json.loads(jsplayer_url_json)
1271 if player_url is None:
1272 player_url_json = self._search_regex(
1273 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1274 video_webpage, 'age gate player URL')
1275 player_url = json.loads(player_url_json)
1276
1277 if self._downloader.params.get('verbose'):
1278 if player_url is None:
1279 player_version = 'unknown'
1280 player_desc = 'unknown'
1281 else:
1282 if player_url.endswith('swf'):
1283 player_version = self._search_regex(
1284 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1285 'flash player', fatal=False)
1286 player_desc = 'flash player %s' % player_version
1287 else:
1288 player_version = self._search_regex(
1289 r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
1290 player_url,
1291 'html5 player', fatal=False)
1292 player_desc = 'html5 player %s' % player_version
1293
1294 parts_sizes = self._signature_cache_id(encrypted_sig)
1295 self.to_screen('{%s} signature length %s, %s' %
1296 (format_id, parts_sizes, player_desc))
1297
1298 signature = self._decrypt_signature(
1299 encrypted_sig, video_id, player_url, age_gate)
1300 url += '&signature=' + signature
1301 if 'ratebypass' not in url:
1302 url += '&ratebypass=yes'
1303
1304 # Some itags are not included in DASH manifest thus corresponding formats will
1305 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1306 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1307 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1308 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1309 dct = {
1310 'format_id': format_id,
1311 'url': url,
1312 'player_url': player_url,
1313 'filesize': int_or_none(url_data.get('clen', [None])[0]),
1314 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1315 'width': width,
1316 'height': height,
1317 'fps': int_or_none(url_data.get('fps', [None])[0]),
1318 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
1319 }
1320 type_ = url_data.get('type', [None])[0]
1321 if type_:
1322 type_split = type_.split(';')
1323 kind_ext = type_split[0].split('/')
1324 if len(kind_ext) == 2:
1325 kind, ext = kind_ext
1326 dct['ext'] = ext
1327 if kind in ('audio', 'video'):
1328 codecs = None
1329 for mobj in re.finditer(
1330 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1331 if mobj.group('key') == 'codecs':
1332 codecs = mobj.group('val')
1333 break
1334 if codecs:
1335 codecs = codecs.split(',')
1336 if len(codecs) == 2:
1337 acodec, vcodec = codecs[0], codecs[1]
1338 else:
1339 acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
1340 dct.update({
1341 'acodec': acodec,
1342 'vcodec': vcodec,
1343 })
1344 if format_id in self._formats:
1345 dct.update(self._formats[format_id])
1346 formats.append(dct)
1347 elif video_info.get('hlsvp'):
1348 manifest_url = video_info['hlsvp'][0]
1349 url_map = self._extract_from_m3u8(manifest_url, video_id)
1350 formats = _map_to_format_list(url_map)
1351 else:
1352 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1353
1354 # Look for the DASH manifest
1355 if self._downloader.params.get('youtube_include_dash_manifest', True):
1356 dash_mpd_fatal = True
1357 for dash_manifest_url in dash_mpds:
1358 dash_formats = {}
1359 try:
1360 for df in self._parse_dash_manifest(
1361 video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1362 # Do not overwrite DASH format found in some previous DASH manifest
1363 if df['format_id'] not in dash_formats:
1364 dash_formats[df['format_id']] = df
1365 # Additional DASH manifests may end up in HTTP Error 403 therefore
1366 # allow them to fail without bug report message if we already have
1367 # some DASH manifest succeeded. This is temporary workaround to reduce
1368 # burst of bug reports until we figure out the reason and whether it
1369 # can be fixed at all.
1370 dash_mpd_fatal = False
1371 except (ExtractorError, KeyError) as e:
1372 self.report_warning(
1373 'Skipping DASH manifest: %r' % e, video_id)
1374 if dash_formats:
1375 # Remove the formats we found through non-DASH, they
1376 # contain less info and it can be wrong, because we use
1377 # fixed values (for example the resolution). See
1378 # https://github.com/rg3/youtube-dl/issues/5774 for an
1379 # example.
1380 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1381 formats.extend(dash_formats.values())
1382
1383 # Check for malformed aspect ratio
1384 stretched_m = re.search(
1385 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1386 video_webpage)
1387 if stretched_m:
1388 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1389 for f in formats:
1390 if f.get('vcodec') != 'none':
1391 f['stretched_ratio'] = ratio
1392
1393 self._sort_formats(formats)
1394
1395 return {
1396 'id': video_id,
1397 'uploader': video_uploader,
1398 'uploader_id': video_uploader_id,
1399 'upload_date': upload_date,
1400 'title': video_title,
1401 'thumbnail': video_thumbnail,
1402 'description': video_description,
1403 'categories': video_categories,
1404 'tags': video_tags,
1405 'subtitles': video_subtitles,
1406 'automatic_captions': automatic_captions,
1407 'duration': video_duration,
1408 'age_limit': 18 if age_gate else 0,
1409 'annotations': video_annotations,
1410 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1411 'view_count': view_count,
1412 'like_count': like_count,
1413 'dislike_count': dislike_count,
1414 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1415 'formats': formats,
1416 'is_live': is_live,
1417 'start_time': start_time,
1418 'end_time': end_time,
1419 }
1420
1421
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # FIX: key was misspelled 'playlist_mincout', so the minimum-count
        # assertion was silently never checked by the test runner
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix (auto-generated playlist)."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following 'Load more' pagination."""
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1614
1615
class YoutubeChannelIE(InfoExtractor):
    """Extractor for YouTube channel pages (/channel/<id>)."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs found in a channel page's HTML.

        Duplicate ids are merged; a later occurrence may supply the title
        when the first match captured none.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                # Already seen: keep first position, backfill missing title
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC...' channel id maps to the 'UU...' uploads playlist
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the channel's paginated video list via the
            # "Load more" AJAX endpoint embedded in each widget's HTML
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1711
1712
class YoutubeUserIE(YoutubeChannelIE):
    """Extract a user's uploads; reuses the channel extraction machinery
    with the /user/ URL template."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # This extractor's URL regex is very permissive, so give every other
        # *IE class defined in this module the chance to claim the URL first.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
1739
1740
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Search extractor behind the 'ytsearch' keyword; pages through the
    results endpoint until enough video ids have been collected."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra GET parameters merged into each results request; subclasses
    # (e.g. the date-sorted search below) override this.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        videos = []
        limit = n

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            # With spf=navigate the endpoint answers with a JSON envelope;
            # the rendered result HTML sits at data[1]['body']['content'].
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop once a page adds nothing new or we have collected more
            # than requested; any surplus is trimmed below.
            if not new_videos or len(videos) > limit:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
1784
1785
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that sorts results newest-first."""
    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Ask the results endpoint to order by upload date.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
1791
1792
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the entries listed on a pasted YouTube results-page URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        # The decoded query doubles as both display id and playlist title.
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result row is headed by an <h3 class="...yt-lockup-title...">.
        for item_html in re.findall(
                r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', results_html):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1834
1835
class YoutubeShowIE(InfoExtractor):
    """Extract a multi-season show as a playlist of season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_matches)))
        season_entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]
        show_title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': show_title,
            'entries': season_entries,
        }
1870
1871
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # Materialized as a list: on Python 3, filter() returns a lazy
            # iterator that is always truthy, so `if not new_ids` would never
            # fire and ids.extend() would consume the iterator, breaking the
            # termination check for the infinite 'recommended' feed.
            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            # Follow the AJAX "load more" widget, if any.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1919
1920
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's Watch Later list."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # The playlist tests inherited from YoutubePlaylistIE do not apply here.
    _TESTS = []

    def _real_extract(self, url):
        # Watch Later is exposed as the special 'WL' playlist.
        return self._extract_playlist('WL')
1930
1931
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites feed to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the favourites playlist; delegate to the
        # playlist extractor once it is known.
        favourites_playlist = self._search_regex(r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_playlist, 'YoutubePlaylist')
1942
1943
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed (':ytrec')."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1949
1950
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the subscriptions feed ('ytsubs')."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1956
1957
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the watch-history feed (':ythistory')."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the original plain string contained '\.', an invalid escape
    # sequence (DeprecationWarning on Python 3.6+, an error in later
    # versions); every sibling extractor already uses an r'' pattern.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1963
1964
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lack a video id — typically the
    result of an unquoted '&' splitting the real URL in the shell."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted from such a URL; explain the likely
        # shell-quoting mistake to the user instead.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
2012
2013
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video id is shorter than the 11 characters
    YouTube uses, i.e. the id itself got cut off."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        truncated_id = self._match_id(url)
        # A too-short id cannot be extracted; report it to the user.
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)