]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
release 2014.12.01
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
652cdaa2 27 get_element_by_attribute,
c5e8d7af 28 ExtractorError,
dd27fd17 29 int_or_none,
9c44d242 30 OnDemandPagedList,
c5e8d7af
PH
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com, requesting English (hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 60 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail
                # out instead of dereferencing a missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Getting the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 183
8377574c 184
de7f3446 185class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 186 IE_DESC = 'YouTube.com'
cb7dfeea 187 _VALID_URL = r"""(?x)^
c5e8d7af 188 (
edb53e2d 189 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 190 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 191 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 192 (?:www\.)?pwnyoutube\.com/|
f7000f3a 193 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
194 tube\.majestyc\.net/|
195 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
196 (?:.*?\#/)? # handle anchor (#/) redirect urls
197 (?: # the various things that can precede the ID:
ac7553d0 198 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 199 |(?: # or the v= param in all its forms
f7000f3a 200 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
201 (?:\?|\#!?) # the params delimiter ? or # or #!
202 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
203 v=
204 )
f4b05232
JMF
205 ))
206 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 207 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 208 )
c5e8d7af 209 )? # all until now is optional -> you can pass the naked ID
8963d9c2 210 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 211 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
212 (?(1).+)? # if we found the ID, everything can follow
213 $"""
c5e8d7af 214 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
215 _formats = {
216 '5': {'ext': 'flv', 'width': 400, 'height': 240},
217 '6': {'ext': 'flv', 'width': 450, 'height': 270},
218 '13': {'ext': '3gp'},
219 '17': {'ext': '3gp', 'width': 176, 'height': 144},
220 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
221 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
222 '34': {'ext': 'flv', 'width': 640, 'height': 360},
223 '35': {'ext': 'flv', 'width': 854, 'height': 480},
224 '36': {'ext': '3gp', 'width': 320, 'height': 240},
225 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
226 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
227 '43': {'ext': 'webm', 'width': 640, 'height': 360},
228 '44': {'ext': 'webm', 'width': 854, 'height': 480},
229 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
230 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
231
1d043b93 232
86fe61c8 233 # 3d videos
43b81eb9
PH
234 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
235 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
236 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
237 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
238 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
239 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
240 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 241
96fb5605 242 # Apple HTTP Live Streaming
43b81eb9
PH
243 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
244 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
245 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
246 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
247 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
248 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
249 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
250
251 # DASH mp4 video
43b81eb9
PH
252 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
253 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
260 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
261 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 263
f6f1fc92 264 # Dash mp4 audio
2c62dc26
PH
265 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
266 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
267 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
268
269 # Dash webm
e75cafe9
A
270 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
271 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 276 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
277 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
278 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 284 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 285 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
286 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
287 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 288 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
289
290 # Dash webm audio
55db73ef 291 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 292 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 293
0857baad
PH
294 # Dash webm audio with opus inside
295 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
296 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
297 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
298
ce6b9a2d
PH
299 # RTMP (unnamed)
300 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 301 }
836a086c 302
78caa52a 303 IE_NAME = 'youtube'
2eb88d95
PH
304 _TESTS = [
305 {
4bc3a23e
PH
306 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
307 'info_dict': {
308 'id': 'BaW_jenozKc',
309 'ext': 'mp4',
310 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
311 'uploader': 'Philipp Hagemeister',
312 'uploader_id': 'phihag',
313 'upload_date': '20121002',
314 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
315 'categories': ['Science & Technology'],
3e7c1224
PH
316 'like_count': int,
317 'dislike_count': int,
2eb88d95 318 }
0e853ca4 319 },
0e853ca4 320 {
4bc3a23e
PH
321 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
322 'note': 'Test generic use_cipher_signature video (#897)',
323 'info_dict': {
324 'id': 'UxxajLWwzqY',
325 'ext': 'mp4',
326 'upload_date': '20120506',
327 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
328 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
329 'uploader': 'Icona Pop',
330 'uploader_id': 'IconaPop',
2eb88d95 331 }
c108eb73
JMF
332 },
333 {
4bc3a23e
PH
334 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
335 'note': 'Test VEVO video with age protection (#956)',
336 'info_dict': {
337 'id': '07FYdnEawAQ',
338 'ext': 'mp4',
339 'upload_date': '20130703',
340 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
341 'description': 'md5:64249768eec3bc4276236606ea996373',
342 'uploader': 'justintimberlakeVEVO',
343 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
344 }
345 },
fccd3771 346 {
4bc3a23e
PH
347 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
348 'note': 'Embed-only video (#1746)',
349 'info_dict': {
350 'id': 'yZIXLfi8CZQ',
351 'ext': 'mp4',
352 'upload_date': '20120608',
353 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
354 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
355 'uploader': 'SET India',
356 'uploader_id': 'setindia'
fccd3771
PH
357 }
358 },
dd27fd17 359 {
4bc3a23e
PH
360 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
361 'note': '256k DASH audio (format 141) via DASH manifest',
362 'info_dict': {
363 'id': 'a9LDPn-MO4I',
364 'ext': 'm4a',
365 'upload_date': '20121002',
366 'uploader_id': '8KVIDEO',
367 'description': '',
368 'uploader': '8KVIDEO',
369 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 370 },
4bc3a23e
PH
371 'params': {
372 'youtube_include_dash_manifest': True,
373 'format': '141',
4919603f 374 },
dd27fd17 375 },
3489b7d2
JMF
376 # DASH manifest with encrypted signature
377 {
78caa52a
PH
378 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
379 'info_dict': {
380 'id': 'IB3lcPjvWLA',
381 'ext': 'm4a',
b766eb27
JMF
382 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
383 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
384 'uploader': 'AfrojackVEVO',
385 'uploader_id': 'AfrojackVEVO',
386 'upload_date': '20131011',
3489b7d2 387 },
4bc3a23e 388 'params': {
78caa52a
PH
389 'youtube_include_dash_manifest': True,
390 'format': '141',
3489b7d2
JMF
391 },
392 },
aa79ac0c
PH
393 # Controversy video
394 {
395 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
396 'info_dict': {
397 'id': 'T4XJQO3qol8',
398 'ext': 'mp4',
399 'upload_date': '20100909',
400 'uploader': 'The Amazing Atheist',
401 'uploader_id': 'TheAmazingAtheist',
402 'title': 'Burning Everyone\'s Koran',
403 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
404 }
c522adb1
JMF
405 },
406 # Normal age-gate video (No vevo, embed allowed)
407 {
408 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
409 'info_dict': {
410 'id': 'HtVdAasjOgU',
411 'ext': 'mp4',
412 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
413 'description': 'md5:eca57043abae25130f58f655ad9a7771',
414 'uploader': 'The Witcher',
415 'uploader_id': 'WitcherGame',
416 'upload_date': '20140605',
417 },
418 },
2eb88d95
PH
419 ]
420
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: (player_url, signature cache id) -> function
    self._player_cache = dict()
e0df6211 424
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 428
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
432
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
436
def report_rtmp_download(self):
    """Print a notice that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 440
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature.

    For example a signature made of a 2-character part and a 3-character
    part yields '2.3'.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
444
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type (file extension) are parsed from ``player_url``.
    If the downloader cache already holds an index permutation for this
    player and signature layout, a function applying that permutation is
    returned without downloading anything.  Otherwise the player is
    downloaded, parsed with the JS or SWF parser, and the permutation it
    performs on a probe string is stored in the cache.

    Raises ExtractorError if the player URL cannot be parsed or the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: the cache key must be a bare name without path components
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an ``assert`` would be stripped under ``python -O``
        # and leave ``res`` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Record which input index ends up at each output position by running
    # the function on a probe string whose characters encode their index.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
486
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is run on a probe string whose characters encode their own
    index; the resulting index permutation is rendered as a sum of ``s[i]``
    items and ``s[a:b:c]`` slices, guarded by a check on the lengths of the
    dot-separated parts of ``example_sig``.
    """
    def _slice_expr(first, last, stride):
        head = '' if first == 0 else str(first)
        stop = last + stride
        tail = (':%d' % stop) if stop >= 0 else ':'
        suffix = '' if stride == 1 else (':%d' % stride)
        return 's[%s%s%s]' % (head, tail, suffix)

    def _expressions(idxs):
        run_step = None
        # Placeholder only; run_start is always assigned together with run_step
        run_start = '(Never used)'
        for cur, before in zip(idxs[1:], idxs[:-1]):
            delta = cur - before
            if run_step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the finished run as a slice.
                if delta != run_step:
                    yield _slice_expr(run_start, before, run_step)
                    run_step = None
            elif delta in [-1, 1]:
                run_step = delta
                run_start = before
            else:
                yield 's[%d]' % before
        if run_step is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, run_step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(_expressions(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 525
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the entry function is taken from the ``.sig||NAME(``
    pattern in ``jscode`` and extracted with JSInterpreter.  The returned
    callable passes its single argument to that function as a one-item list.
    """
    entry_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')
    entry_point = JSInterpreter(jscode).extract_function(entry_name)

    def decipher(s):
        return entry_point([s])
    return decipher
534
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of an SWF player.

    Extracts the ``decipher`` function of the ``SignatureDecipher`` class
    with SWFInterpreter.  The returned callable passes its single argument
    to that function as a one-item list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    entry_point = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return entry_point([s])
    return decipher
541
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The deciphering function is looked up in (or added to) the per-instance
    player cache, keyed by the player URL and the signature's cache id.
    Raises ExtractorError if ``player_url`` is None or if anything fails
    while obtaining or applying the function.  ``age_gate`` is not used.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        decipher = self._player_cache.get(cache_key)
        if cache_key not in self._player_cache:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        # Embed the full traceback in the message to ease bug reports
        details = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + details, cause=e)
e0df6211 565
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to subtitle download URLs.

    The track list is fetched from the timedtext listing; only the first
    track found for each language is kept.  A warning is reported and an
    empty dict returned when the list cannot be downloaded or contains no
    tracks.  ``webpage`` is not used.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(
        r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)

    urls_by_lang = {}
    for track in tracks:
        track_name, lang = track[0], track[1]
        if lang not in urls_by_lang:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            urls_by_lang[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
593
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language codes to automatic caption URLs.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The lookup is best-effort: if the player config is missing or cannot
    be parsed, required keys are absent, the caption list download fails,
    or the first track is not of kind 'asr', a warning is reported and an
    empty dict is returned.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match stops at the first '};', so the captured text
        # may not be valid JSON; treat that like a missing config.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
640
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of ``_VALID_URL``) found in ``url``.

    Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
648
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is read from the ``itag/<digits>/`` part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

    urls_by_itag = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        urls_by_itag[itag] = line
    return urls_by_itag
663
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations page for ``video_id``."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 667
c5e8d7af 668 def _real_extract(self, url):
7e8c0af0 669 proto = (
78caa52a
PH
670 'http' if self._downloader.params.get('prefer_insecure', False)
671 else 'https')
7e8c0af0 672
c5e8d7af
PH
673 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
674 mobj = re.search(self._NEXT_URL_RE, url)
675 if mobj:
7e8c0af0 676 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 677 video_id = self.extract_id(url)
c5e8d7af
PH
678
679 # Get video webpage
aa79ac0c 680 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 681 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
682
683 # Attempt to extract SWF player URL
e0df6211 684 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
685 if mobj is not None:
686 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
687 else:
688 player_url = None
689
690 # Get video info
c108eb73 691 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
692 age_gate = True
693 # We simulate the access to the video from www.youtube.com/v/{video_id}
694 # this can be viewed without login into Youtube
2c57c7fa
JMF
695 data = compat_urllib_parse.urlencode({
696 'video_id': video_id,
697 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 698 'sts': self._search_regex(
94bd3613 699 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
2c57c7fa 700 })
7e8c0af0 701 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
702 video_info_webpage = self._download_webpage(
703 video_info_url, video_id,
20436c30 704 note='Refetching age-gated info webpage',
94bd3613 705 errnote='unable to download video info webpage')
c5e8d7af 706 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
707 else:
708 age_gate = False
4e62ebe2
JMF
709 try:
710 # Try looking directly into the video webpage
711 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
712 if not mobj:
713 raise ValueError('Could not find ytplayer.config') # caught below
714 json_code = uppercase_escape(mobj.group(1))
715 ytplayer_config = json.loads(json_code)
716 args = ytplayer_config['args']
717 # Convert to the same format returned by compat_parse_qs
718 video_info = dict((k, [v]) for k, v in args.items())
719 if 'url_encoded_fmt_stream_map' not in args:
720 raise ValueError('No stream_map present') # caught below
721 except ValueError:
722 # We fallback to the get_video_info pages (used by the embed page)
723 self.report_video_info_webpage_download(video_id)
724 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
725 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
726 % (video_id, el_type))
727 video_info_webpage = self._download_webpage(video_info_url,
728 video_id, note=False,
729 errnote='unable to download video info webpage')
730 video_info = compat_parse_qs(video_info_webpage)
731 if 'token' in video_info:
732 break
c5e8d7af
PH
733 if 'token' not in video_info:
734 if 'reason' in video_info:
d11271dd 735 raise ExtractorError(
78caa52a 736 'YouTube said: %s' % video_info['reason'][0],
d11271dd 737 expected=True, video_id=video_id)
c5e8d7af 738 else:
d11271dd 739 raise ExtractorError(
78caa52a 740 '"token" parameter not in video info for unknown reason',
d11271dd 741 video_id=video_id)
c5e8d7af 742
1d699755
PH
743 if 'view_count' in video_info:
744 view_count = int(video_info['view_count'][0])
745 else:
746 view_count = None
747
c5e8d7af
PH
748 # Check for "rental" videos
749 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 750 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
751
752 # Start extracting information
753 self.report_information_extraction(video_id)
754
755 # uploader
756 if 'author' not in video_info:
69ea8ca4 757 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
758 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
759
760 # uploader_id
761 video_uploader_id = None
762 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
763 if mobj is not None:
764 video_uploader_id = mobj.group(1)
765 else:
69ea8ca4 766 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
767
768 # title
a8c6b241 769 if 'title' in video_info:
aa92f063 770 video_title = video_info['title'][0]
a8c6b241 771 else:
69ea8ca4 772 self._downloader.report_warning('Unable to extract video title')
78caa52a 773 video_title = '_'
c5e8d7af
PH
774
775 # thumbnail image
7763b04e
JMF
776 # We try first to get a high quality image:
777 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
778 video_webpage, re.DOTALL)
779 if m_thumb is not None:
780 video_thumbnail = m_thumb.group(1)
781 elif 'thumbnail_url' not in video_info:
69ea8ca4 782 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 783 video_thumbnail = None
c5e8d7af
PH
784 else: # don't panic if we can't find it
785 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
786
787 # upload date
788 upload_date = None
ad3bc6ac 789 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
790 if mobj is None:
791 mobj = re.search(
263bd4ec 792 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 793 video_webpage)
c5e8d7af
PH
794 if mobj is not None:
795 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
796 upload_date = unified_strdate(upload_date)
797
55f7bd2d
PH
798 m_cat_container = self._search_regex(
799 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
800 video_webpage, 'categories', fatal=False)
ec8deefc 801 if m_cat_container:
ad3bc6ac 802 category = self._html_search_regex(
01ed5c9b 803 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
804 default=None)
805 video_categories = None if category is None else [category]
806 else:
807 video_categories = None
ec8deefc 808
c5e8d7af
PH
809 # description
810 video_description = get_element_by_id("eow-description", video_webpage)
811 if video_description:
27dcce19
PH
812 video_description = re.sub(r'''(?x)
813 <a\s+
814 (?:[a-zA-Z-]+="[^"]+"\s+)*?
815 title="([^"]+)"\s+
816 (?:[a-zA-Z-]+="[^"]+"\s+)*?
817 class="yt-uix-redirect-link"\s*>
818 [^<]+
819 </a>
820 ''', r'\1', video_description)
c5e8d7af
PH
821 video_description = clean_html(video_description)
822 else:
823 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
824 if fd_mobj:
825 video_description = unescapeHTML(fd_mobj.group(1))
826 else:
78caa52a 827 video_description = ''
c5e8d7af 828
f30a38be 829 def _extract_count(count_name):
46374a56 830 count = self._search_regex(
f30a38be
JMF
831 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
832 video_webpage, count_name, default=None)
336c3a69
JMF
833 if count is not None:
834 return int(count.replace(',', ''))
835 return None
69ea8ca4
PH
836 like_count = _extract_count('like')
837 dislike_count = _extract_count('dislike')
336c3a69 838
c5e8d7af 839 # subtitles
d82134c3 840 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 841
c5e8d7af 842 if self._downloader.params.get('listsubtitles', False):
d665f8d3 843 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
844 return
845
846 if 'length_seconds' not in video_info:
69ea8ca4 847 self._downloader.report_warning('unable to extract video duration')
b466b702 848 video_duration = None
c5e8d7af 849 else:
b466b702 850 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 851
1fb07d10
JG
852 # annotations
853 video_annotations = None
854 if self._downloader.params.get('writeannotations', False):
5f6a1245 855 video_annotations = self._extract_annotations(video_id)
1fb07d10 856
dd27fd17
PH
857 def _map_to_format_list(urlmap):
858 formats = []
859 for itag, video_real_url in urlmap.items():
860 dct = {
861 'format_id': itag,
862 'url': video_real_url,
863 'player_url': player_url,
864 }
0b65e5d4
PH
865 if itag in self._formats:
866 dct.update(self._formats[itag])
dd27fd17
PH
867 formats.append(dct)
868 return formats
869
c5e8d7af
PH
870 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
871 self.report_rtmp_download()
dd27fd17
PH
872 formats = [{
873 'format_id': '_rtmp',
874 'protocol': 'rtmp',
875 'url': video_info['conn'][0],
876 'player_url': player_url,
877 }]
00fe14fc 878 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
5f6a1245 879 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 880 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 881 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 882 url_map = {}
00fe14fc 883 for url_data_str in encoded_url_map.split(','):
c5e8d7af 884 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
885 if 'itag' not in url_data or 'url' not in url_data:
886 continue
887 format_id = url_data['itag'][0]
888 url = url_data['url'][0]
889
890 if 'sig' in url_data:
891 url += '&signature=' + url_data['sig'][0]
892 elif 's' in url_data:
893 encrypted_sig = url_data['s'][0]
894
895 if not age_gate:
896 jsplayer_url_json = self._search_regex(
897 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 898 video_webpage, 'JS player URL')
201e9eaa
PH
899 player_url = json.loads(jsplayer_url_json)
900 if player_url is None:
901 player_url_json = self._search_regex(
902 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 903 video_webpage, 'age gate player URL')
201e9eaa
PH
904 player_url = json.loads(player_url_json)
905
906 if self._downloader.params.get('verbose'):
cf010131 907 if player_url is None:
201e9eaa
PH
908 player_version = 'unknown'
909 player_desc = 'unknown'
910 else:
911 if player_url.endswith('swf'):
912 player_version = self._search_regex(
913 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 914 'flash player', fatal=False)
201e9eaa 915 player_desc = 'flash player %s' % player_version
cf010131 916 else:
201e9eaa
PH
917 player_version = self._search_regex(
918 r'html5player-([^/]+?)(?:/html5player)?\.js',
919 player_url,
920 'html5 player', fatal=False)
78caa52a 921 player_desc = 'html5 player %s' % player_version
201e9eaa 922
60064c53 923 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 924 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 925 (format_id, parts_sizes, player_desc))
201e9eaa
PH
926
927 signature = self._decrypt_signature(
928 encrypted_sig, video_id, player_url, age_gate)
929 url += '&signature=' + signature
930 if 'ratebypass' not in url:
931 url += '&ratebypass=yes'
932 url_map[format_id] = url
dd27fd17 933 formats = _map_to_format_list(url_map)
1d043b93
JMF
934 elif video_info.get('hlsvp'):
935 manifest_url = video_info['hlsvp'][0]
936 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 937 formats = _map_to_format_list(url_map)
c5e8d7af 938 else:
69ea8ca4 939 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 940
dd27fd17 941 # Look for the DASH manifest
203fb43f 942 if self._downloader.params.get('youtube_include_dash_manifest', True):
dd27fd17 943 try:
d68f0cdb 944 # The DASH manifest used needs to be the one from the original video_webpage.
945 # The one found in get_video_info seems to be using different signatures.
946 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
947 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
948 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
7160532d 949 dash_manifest_url = video_info.get('dashmpd')[0]
5f6a1245 950
d68f0cdb 951 def decrypt_sig(mobj):
952 s = mobj.group(1)
953 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
954 return '/signature/%s' % dec_s
955 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 956 dash_doc = self._download_xml(
d68f0cdb 957 dash_manifest_url, video_id,
69ea8ca4
PH
958 note='Downloading DASH manifest',
959 errnote='Could not download DASH manifest')
960 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
dd27fd17
PH
961 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
962 if url_el is None:
963 continue
964 format_id = r.attrib['id']
965 video_url = url_el.text
966 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
967 f = {
968 'format_id': format_id,
969 'url': video_url,
970 'width': int_or_none(r.attrib.get('width')),
971 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
972 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
973 'filesize': filesize,
974 }
975 try:
976 existing_format = next(
977 fo for fo in formats
978 if fo['format_id'] == format_id)
979 except StopIteration:
980 f.update(self._formats.get(format_id, {}))
981 formats.append(f)
982 else:
983 existing_format.update(f)
984
985 except (ExtractorError, KeyError) as e:
23ad44b5 986 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
d80044c2 987
4bcc7bd1 988 self._sort_formats(formats)
4ea3be0a 989
990 return {
8bcc8756
JW
991 'id': video_id,
992 'uploader': video_uploader,
993 'uploader_id': video_uploader_id,
994 'upload_date': upload_date,
995 'title': video_title,
996 'thumbnail': video_thumbnail,
997 'description': video_description,
998 'categories': video_categories,
999 'subtitles': video_subtitles,
1000 'duration': video_duration,
1001 'age_limit': 18 if age_gate else 0,
1002 'annotations': video_annotations,
7e8c0af0 1003 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 1004 'view_count': view_count,
4ea3be0a 1005 'like_count': like_count,
1006 'dislike_count': dislike_count,
8bcc8756 1007 'formats': formats,
4ea3be0a 1008 }
c5e8d7af 1009
5f6a1245 1010
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlist URLs and bare playlist ids, including mixes ("RD" ids)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix; its videos are read from a watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        # Candidate title elements, tried in this order
        title_span = search_title('playlist-title')
        if not title_span:
            title_span = search_title('title long-title')
        if not title_span:
            title_span = search_title('title')
        title = clean_html(title_span)

        id_pattern = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(id_pattern, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Collect all video ids of the playlist, following its "load more" pages."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        parsed_url = compat_urlparse.urlparse(url)
        query_dict = compat_urlparse.parse_qs(parsed_url.query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if not self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
            else:
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []
        page_num = 0
        while True:
            page_num += 1
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1181
1182
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the named list on the channel page and return its videos as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        channel, title = mobj.group('chann'), mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link_pattern = r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query)
        link = self._html_search_regex(link_pattern, channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while not ids:
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1224
1225
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning all videos listed on a channel as a playlist."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates, in page order."""
        return orderedSet(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        """Gather the channel's video ids and return them as a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            video_ids = []
            # Download all channel pages using the json-based channel_ajax query
            pagenum = 0
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1285
1286
class YoutubeUserIE(InfoExtractor):
    """Extractor listing a user's uploads through the paged GData feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor class in this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            page_size = self._GDATA_PAGE_SIZE
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # No entries on this page: nothing (more) to yield
                return

            # Extract video identifiers
            for entry in feed['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1360
b05654f0
PH
1361
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor ("ytsearch" keyword) backed by the GData videos feed."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        found_ids = []
        limit = n
        first_index = 0  # zero-based offset of the next page's first result

        while first_index < limit:
            page_number = first_index // PAGE_SIZE + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                first_index + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % page_number,
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            found_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more results than the API reports in total
            limit = min(n, api_response['totalItems'])
            first_index += PAGE_SIZE

        videos = []
        for video_id in found_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1403
c9ae7b95 1404
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1410
c9ae7b95
PH
1411
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor turning a search results page URL into a playlist of its results."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result titles and links from the search results page."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from_snippet(part_code):
            # The title is optional, the link is mandatory
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            entries.append(_entry_from_snippet(part_code))

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1453
1454
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season playlist becomes one entry."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1489
1490
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    https://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; takes the paging value via ``%``."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        # '%%s' survives this formatting as '%s' for the paging value.
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return every video as a playlist.

        Paging starts at 0 and continues with the ``paging`` value found
        in the "load more" link; it stops when no such link is present.
        """
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i)
            # A response with neither key (or only empty values) used to
            # leave None here and crash the regex calls with a TypeError.
            # Treat it as a page without markup instead: no ids, no
            # "load more" link, so pagination ends cleanly.
            feed_html = info.get('feed_html') or info.get('content_html') or ''
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1536
5f6a1245 1537
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    # Name substituted into the feed URL and into the extractor name.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    # Matches the feed page URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1543
5f6a1245 1544
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed."""
    # Name substituted into the feed URL and into the extractor name.
    _FEED_NAME = 'watch_later'
    # Request the feed through action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # Matches the feed page URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1551
5f6a1245 1552
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # (newer Pythons warn about it); the pattern value itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    # Name substituted into the feed URL and into the extractor name.
    _FEED_NAME = 'history'
    # Request the feed through action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1559
5f6a1245 1560
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist that backs it."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    # Matches the page URL or ":ytfav", ":ytfavorites", ":ytfavourites".
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch the favourites page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1571
1572
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the subscriptions feed as a playlist of videos."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect video ids from the feed page and its "load more" pages."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        video_ids = []
        # On the first round both the content and the "load more" widget
        # are looked up in the full page.
        content_html = first_page
        more_widget_html = first_page

        page_num = 1
        while True:
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            video_ids.extend(orderedSet(found))

            next_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not next_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % next_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1609
1610
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that lost their video id and explain the likely cause."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    # Either a watch URL with no parameters or only a feature/annotation_id
    # parameter, or an attribution_link URL with only its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)