]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[/__init__] Add another cute search example
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import re
10import time
11import traceback
12
13from .common import InfoExtractor, SearchInfoExtractor
14from .subtitles import SubtitlesInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24)
25from ..utils import (
26 clean_html,
27 ExtractorError,
28 get_element_by_attribute,
29 get_element_by_id,
30 int_or_none,
31 OnDemandPagedList,
32 orderedSet,
33 unescapeHTML,
34 unified_strdate,
35 uppercase_escape,
36)
37
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the English interface (hl=en) via the PREF cookie so that
        # scraped strings are predictable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Keep the documented True/False contract (was a bare `return`)
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Bail out: continuing would crash with AttributeError on
                # match.group(1) below
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                # Same as above: warn and fail instead of crashing
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before extraction: set cookies and (optionally) log in.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
185
186
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Extractor for individual YouTube videos (watch pages, embeds, short
    youtu.be links and naked 11-character IDs)."""
    IE_DESC = 'YouTube.com'
    # Verbose regex; group 1 is the (optional) URL prefix, group 2 the video ID
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static per-itag format metadata, merged into the formats discovered at
    # extraction time.  Negative 'preference' values rank special-purpose
    # variants (3D, HLS, DASH) below the plain progressive formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = 'youtube'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 'md5:eca57043abae25130f58f655ad9a7771',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # Age-gate video with encrypted signature
        {
            'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
            'info_dict': {
                'id': '6kLq3WMV1nU',
                'ext': 'mp4',
                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                'uploader': 'LloydVEVO',
                'uploader_id': 'LloydVEVO',
                'upload_date': '20110629',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',
                'upload_date': '20120731',
                'uploader_id': 'olympic',
                'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                'uploader': 'Olympics',
                'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
            },
            'params': {
                'skip_download': 'requires avconv',
            }
        },
    ]
467
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # In-memory cache of signature-decryption functions, keyed by
        # (player_url, signature length spec); see _decrypt_signature.
        self._player_cache = {}
471
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        # Thin wrapper so _real_extract reads cleanly
        self.to_screen('%s: Downloading video info webpage' % video_id)
475
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)
479
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))
483
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
487
488 def _signature_cache_id(self, example_sig):
489 """ Return a string representation of a signature """
490 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
491
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from cache) the signature-decryption function for
        the player at *player_url*.

        Returns a callable mapping an encrypted signature string to the
        decrypted one.  Raises ExtractorError if the player cannot be
        identified from its URL.
        """
        # The player URL encodes an id and a type (js or swf extension)
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache filename; must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of input-character indices: the
            # signature function is a pure reordering/selection of characters
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the index spec by running the freshly-parsed function on a
        # probe string of distinct characters, then persist it
        if cache_spec is None:
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
533
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function *func*,
        reconstructed by probing it with a string of distinct characters
        (used for --youtube-print-sig-code style debugging)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with constant stride as a slice
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # We are inside a +1/-1 run: extend it or close it out
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new run with stride +1 or -1
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index
                    yield 's[%d]' % prev
            # Flush the final element or the final open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Each output character's code point reveals which input index it
        # came from, yielding the permutation spec
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
572
573 def _parse_sig_js(self, jscode):
574 funcname = self._search_regex(
575 r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
576 'Initial JS player signature function name')
577
578 jsi = JSInterpreter(jscode)
579 initial_function = jsi.extract_function(funcname)
580 return lambda s: initial_function([s])
581
582 def _parse_sig_swf(self, file_contents):
583 swfi = SWFInterpreter(file_contents)
584 TARGET_CLASSNAME = 'SignatureDecipher'
585 searched_class = swfi.extract_class(TARGET_CLASSNAME)
586 initial_function = swfi.extract_function(searched_class, 'decipher')
587 return lambda s: initial_function([s])
588
589 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
590 """Turn the encrypted s field into a working signature"""
591
592 if player_url is None:
593 raise ExtractorError('Cannot decrypt signature without player_url')
594
595 if player_url.startswith('//'):
596 player_url = 'https:' + player_url
597 try:
598 player_id = (player_url, self._signature_cache_id(s))
599 if player_id not in self._player_cache:
600 func = self._extract_signature_function(
601 video_id, player_url, s
602 )
603 self._player_cache[player_id] = func
604 func = self._player_cache[player_id]
605 if self._downloader.params.get('youtube_print_sig_code'):
606 self._print_sig_code(func, s)
607 return func(s)
608 except Exception as e:
609 tb = traceback.format_exc()
610 raise ExtractorError(
611 'Signature extraction failed: ' + tb, cause=e)
612
    def _get_available_subtitles(self, video_id, webpage):
        """Return a dict mapping language codes to subtitle URLs, or {} if
        none are available.  *webpage* is unused here but kept for interface
        compatibility with the subtitles base class."""
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track for each language
            if lang in sub_lang_list:
                continue
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': track.attrib['name'].encode('utf-8'),
            })
            url = 'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
639
    def _get_available_automatic_caption(self, video_id, webpage):
        """Return a dict mapping language codes to automatic-caption URLs.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> is the video's original language
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # Each <target> is a language the captions can be translated to
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
687
688 @classmethod
689 def extract_id(cls, url):
690 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
691 if mobj is None:
692 raise ExtractorError('Invalid URL: %s' % url)
693 video_id = mobj.group(2)
694 return video_id
695
696 def _extract_from_m3u8(self, manifest_url, video_id):
697 url_map = {}
698
699 def _get_urls(_manifest):
700 lines = _manifest.split('\n')
701 urls = filter(lambda l: l and not l.startswith('#'),
702 lines)
703 return urls
704 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
705 formats_urls = _get_urls(manifest)
706 for format_url in formats_urls:
707 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
708 url_map[itag] = format_url
709 return url_map
710
    def _extract_annotations(self, video_id):
        """Download and return the raw annotations document for *video_id*."""
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
714
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        """Download and parse a DASH manifest; return a list of format dicts.

        Encrypted '/s/<sig>' components in the manifest URL are decrypted
        in place before downloading.
        """
        def decrypt_sig(mobj):
            # re.sub callback: rewrite '/s/<enc>' as '/signature/<dec>'
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            try:
                # Merge into an already-seen representation with the same id;
                # otherwise register a new format enriched with the static
                # metadata from self._formats
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                f.update(self._formats.get(format_id, {}))
                formats.append(f)
            else:
                existing_format.update(f)
        return formats
754
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single video.

        Downloads the watch page (or, for age-gated videos, the embed page
        plus the get_video_info endpoint), decrypts stream signatures where
        needed, and returns a standard info dict including all formats.
        """
        # Scheme honours --prefer-insecure; reused for every URL built below.
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        # (the URL appears backslash-escaped inside swfConfig, hence the unescaping below)
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                # 'sts' is a timestamp-like token from the embed page;
                # passed through verbatim (empty string when absent)
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                # Each 'el' variant may succeed where the others fail.
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    # 'token' presence marks a usable response
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            # fallback markup: "Published/Uploaded/Streamed live on <date>"
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # normalize separators and whitespace before parsing the date
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # category (single-element list, matching the info-dict convention)
        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace YouTube redirect links with their displayed title text
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Parse the thousands-separated counter next to the like/dislike
            # buttons; None when the element is missing.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Build format dicts from {itag: url}, merging static metadata
            # from self._formats when the itag is known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            # Both maps are comma-separated lists of querystring-encoded streams
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    # Signature already in plain form
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: needs the JS/SWF player to decrypt
                    encrypted_sig = url_data['s'][0]

                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        embed_webpage if age_gate else video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        # Report which player (flash/html5, which version)
                        # produced the signature, to help debug decryption
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    # A broken manifest should not kill the whole extraction
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    formats.extend(dash_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1066
1067
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including uploads (UU...), mixes
    (RD...) and other list types matched by _VALID_URL."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # Fixed typo: was 'playlist_mincout', which the test runner silently
        # ignored, so the assertion never ran.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        # Private playlists need the account cookies.
        self._login()

    def _ids_to_results(self, ids):
        """Wrap raw video ids into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix playlist.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id, so the entries must be scraped from the
        watch page rather than a playlist page.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup has varied over time; try the known variants.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result with every video id found on the
        (possibly paginated) playlist pages."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages, following the
        # "Load more" AJAX widget until it disappears or returns no content.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1248
1249
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo-URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # The list link on the channel page carries the title as a query
        # parameter, so search for its encoded form.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        for attempt in itertools.count(0):
            note = 'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt

            page = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, page))
            if ids:
                break
        entries = self._ids_to_results(ids)
        return self.playlist_result(entries, playlist_title=title)
1291
1292
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, first-seen order."""
        return orderedSet(
            match.group(1)
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(vid, 'Youtube', video_id=vid)
                for vid in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the "Load more" AJAX pages.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for vid in self.extract_videos_from_page(content_html):
                    yield self.url_result(vid, 'Youtube', video_id=vid)

                load_more = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not load_more:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % load_more.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1354
1355
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user, via the GData API."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # Maximum page size allowed by the GData uploads feed
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding url_result dicts for one GData page;
            # consumed lazily by OnDemandPagedList below.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Empty page: past the last upload
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1424
1425
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData video search API."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; never ask for more than that.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page.
        del video_ids[n:]
        videos = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in video_ids]
        return self.playlist_result(videos, query)
1467
1468
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same GData search as YoutubeSearchIE, but the API URL adds
    # orderby=published so the newest uploads come first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1474
1475
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search-result page URLs (scrapes the HTML)."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each search hit sits inside its own lockup-title header.
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1517
1518
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube shows: one playlist entry per season."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_links = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_links)))
        entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_links
        ]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
1553
1554
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are account-specific, so authentication is mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template for the feed AJAX endpoint; the escaped '%%s' leaves
        # a '%s' slot for the paging token filled in by _real_extract.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the paginated feed and return all linked videos as a playlist."""
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i,
                transform_source=uppercase_escape)
            # The JSON key differs between feed variants; try both.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # The next paging token lives in the "Load more" widget; absence
            # of the token means we reached the last page.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1602
1603
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Personalized "recommended" feed; all paging logic lives in the base class.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1609
1610
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # "Watch later" list; uses the personal-feed AJAX action (_PERSONAL_FEED).
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True
1617
1618
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Watch-history feed; uses the personal-feed AJAX action (_PERSONAL_FEED).
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw-string prefix added: the pattern contains regex escapes like \.
    # which are invalid string-literal escapes in a plain string (deprecated
    # on newer Pythons) — the string value itself is unchanged, and this now
    # matches the convention used by every sibling class.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1625
1626
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourites list."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites feed is just an ordinary playlist: find its id on
        # the page and delegate to the playlist extractor.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist = self._search_regex(r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist, 'YoutubePlaylist')
1637
1638
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the logged-in user's subscriptions feed."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        video_ids = []
        more_widget_html = content_html = page

        for page_idx in itertools.count(1):
            video_ids.extend(orderedSet(
                re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not load_more:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), title,
                'Downloading page #%s' % page_idx,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1675
1676
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution URLs that lost their video id (typically
    # because the shell ate everything after an unquoted '&') and raises a
    # helpful error instead of a confusing "unsupported URL" one.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always fails on purpose: the URL cannot contain a video id.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)