]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Try to extract the video_info from the webpage before requesting the 'get_v...
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
652cdaa2 27 get_element_by_attribute,
c5e8d7af 28 ExtractorError,
dd27fd17 29 int_or_none,
9c44d242 30 OnDemandPagedList,
c5e8d7af
PH
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (which carries hl=en) for .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 60 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises
            return False

        galx = self._search_regex(
            r'(?s)<input.+?name="GALX".+?value="(.+?)"',
            login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be built; give up
                # instead of dereferencing the missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless no downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        # The login result is not acted upon here
        self._login()
c5e8d7af 183
8377574c 184
de7f3446 185class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 186 IE_DESC = 'YouTube.com'
cb7dfeea 187 _VALID_URL = r"""(?x)^
c5e8d7af 188 (
edb53e2d 189 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 190 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 191 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 192 (?:www\.)?pwnyoutube\.com/|
f7000f3a 193 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
194 tube\.majestyc\.net/|
195 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
196 (?:.*?\#/)? # handle anchor (#/) redirect urls
197 (?: # the various things that can precede the ID:
ac7553d0 198 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 199 |(?: # or the v= param in all its forms
f7000f3a 200 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
201 (?:\?|\#!?) # the params delimiter ? or # or #!
202 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
203 v=
204 )
f4b05232
JMF
205 ))
206 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 207 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 208 )
c5e8d7af 209 )? # all until now is optional -> you can pass the naked ID
8963d9c2 210 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 211 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
212 (?(1).+)? # if we found the ID, everything can follow
213 $"""
c5e8d7af 214 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
215 _formats = {
216 '5': {'ext': 'flv', 'width': 400, 'height': 240},
217 '6': {'ext': 'flv', 'width': 450, 'height': 270},
218 '13': {'ext': '3gp'},
219 '17': {'ext': '3gp', 'width': 176, 'height': 144},
220 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
221 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
222 '34': {'ext': 'flv', 'width': 640, 'height': 360},
223 '35': {'ext': 'flv', 'width': 854, 'height': 480},
224 '36': {'ext': '3gp', 'width': 320, 'height': 240},
225 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
226 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
227 '43': {'ext': 'webm', 'width': 640, 'height': 360},
228 '44': {'ext': 'webm', 'width': 854, 'height': 480},
229 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
230 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
231
1d043b93 232
86fe61c8 233 # 3d videos
43b81eb9
PH
234 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
235 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
236 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
237 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
238 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
239 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
240 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 241
96fb5605 242 # Apple HTTP Live Streaming
43b81eb9
PH
243 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
244 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
245 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
246 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
247 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
248 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
249 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
250
251 # DASH mp4 video
43b81eb9
PH
252 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
253 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
260 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
261 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 263
f6f1fc92 264 # Dash mp4 audio
2c62dc26
PH
265 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
266 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
267 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
268
269 # Dash webm
e75cafe9
A
270 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
271 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 276 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
277 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
278 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 284 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 285 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
286 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
287 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 288 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
289
290 # Dash webm audio
55db73ef 291 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 292 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 293
0857baad
PH
294 # Dash webm audio with opus inside
295 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
296 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
297 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
298
ce6b9a2d
PH
299 # RTMP (unnamed)
300 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 301 }
836a086c 302
78caa52a 303 IE_NAME = 'youtube'
2eb88d95
PH
304 _TESTS = [
305 {
4bc3a23e
PH
306 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
307 'info_dict': {
308 'id': 'BaW_jenozKc',
309 'ext': 'mp4',
310 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
311 'uploader': 'Philipp Hagemeister',
312 'uploader_id': 'phihag',
313 'upload_date': '20121002',
314 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
315 'categories': ['Science & Technology'],
3e7c1224
PH
316 'like_count': int,
317 'dislike_count': int,
2eb88d95 318 }
0e853ca4 319 },
0e853ca4 320 {
4bc3a23e
PH
321 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
322 'note': 'Test generic use_cipher_signature video (#897)',
323 'info_dict': {
324 'id': 'UxxajLWwzqY',
325 'ext': 'mp4',
326 'upload_date': '20120506',
327 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
328 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
329 'uploader': 'Icona Pop',
330 'uploader_id': 'IconaPop',
2eb88d95 331 }
c108eb73
JMF
332 },
333 {
4bc3a23e
PH
334 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
335 'note': 'Test VEVO video with age protection (#956)',
336 'info_dict': {
337 'id': '07FYdnEawAQ',
338 'ext': 'mp4',
339 'upload_date': '20130703',
340 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
341 'description': 'md5:64249768eec3bc4276236606ea996373',
342 'uploader': 'justintimberlakeVEVO',
343 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
344 }
345 },
fccd3771 346 {
4bc3a23e
PH
347 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
348 'note': 'Embed-only video (#1746)',
349 'info_dict': {
350 'id': 'yZIXLfi8CZQ',
351 'ext': 'mp4',
352 'upload_date': '20120608',
353 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
354 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
355 'uploader': 'SET India',
356 'uploader_id': 'setindia'
fccd3771
PH
357 }
358 },
dd27fd17 359 {
4bc3a23e
PH
360 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
361 'note': '256k DASH audio (format 141) via DASH manifest',
362 'info_dict': {
363 'id': 'a9LDPn-MO4I',
364 'ext': 'm4a',
365 'upload_date': '20121002',
366 'uploader_id': '8KVIDEO',
367 'description': '',
368 'uploader': '8KVIDEO',
369 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 370 },
4bc3a23e
PH
371 'params': {
372 'youtube_include_dash_manifest': True,
373 'format': '141',
4919603f 374 },
dd27fd17 375 },
3489b7d2
JMF
376 # DASH manifest with encrypted signature
377 {
78caa52a
PH
378 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
379 'info_dict': {
380 'id': 'IB3lcPjvWLA',
381 'ext': 'm4a',
b766eb27
JMF
382 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
383 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
384 'uploader': 'AfrojackVEVO',
385 'uploader_id': 'AfrojackVEVO',
386 'upload_date': '20131011',
3489b7d2 387 },
4bc3a23e 388 'params': {
78caa52a
PH
389 'youtube_include_dash_manifest': True,
390 'format': '141',
3489b7d2
JMF
391 },
392 },
aa79ac0c
PH
393 # Controversy video
394 {
395 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
396 'info_dict': {
397 'id': 'T4XJQO3qol8',
398 'ext': 'mp4',
399 'upload_date': '20100909',
400 'uploader': 'The Amazing Atheist',
401 'uploader_id': 'TheAmazingAtheist',
402 'title': 'Burning Everyone\'s Koran',
403 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
404 }
405 }
2eb88d95
PH
406 ]
407
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initializer and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player URL, signature cache id)
    self._player_cache = dict()
e0df6211 411
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a status line saying the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 415
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a status line saying video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
419
def report_unavailable_format(self, video_id, format):
    """Print a status line saying the given format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
423
def report_rtmp_download(self):
    """Print a status line saying an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 427
60064c53
PH
428 def _signature_cache_id(self, example_sig):
429 """ Return a string representation of a signature """
78caa52a 430 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
431
432 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 433 id_m = re.match(
c081b35c 434 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 435 player_url)
c081b35c
PH
436 if not id_m:
437 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
438 player_type = id_m.group('ext')
439 player_id = id_m.group('id')
440
c4417ddb 441 # Read from filesystem cache
60064c53
PH
442 func_id = '%s_%s_%s' % (
443 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 444 assert os.path.basename(func_id) == func_id
a0e07d31 445
69ea8ca4 446 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 447 if cache_spec is not None:
78caa52a 448 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 449
e0df6211
PH
450 if player_type == 'js':
451 code = self._download_webpage(
452 player_url, video_id,
69ea8ca4
PH
453 note='Downloading %s player %s' % (player_type, player_id),
454 errnote='Download of %s failed' % player_url)
83799698 455 res = self._parse_sig_js(code)
c4417ddb 456 elif player_type == 'swf':
e0df6211
PH
457 urlh = self._request_webpage(
458 player_url, video_id,
69ea8ca4
PH
459 note='Downloading %s player %s' % (player_type, player_id),
460 errnote='Download of %s failed' % player_url)
e0df6211 461 code = urlh.read()
83799698 462 res = self._parse_sig_swf(code)
e0df6211
PH
463 else:
464 assert False, 'Invalid player type %r' % player_type
465
a0e07d31 466 if cache_spec is None:
78caa52a 467 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
468 cache_res = res(test_string)
469 cache_spec = [ord(c) for c in cache_res]
83799698 470
69ea8ca4 471 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
472 return res
473
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function *func*.

    func is applied to a probe string whose i-th character has code point
    i, so the code points of the result are the source indices. Those
    indices are rendered as a sum of index and slice expressions on ``s``,
    guarded by a check of the signature's part lengths, and the text is
    sent to self.to_screen.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions covering idxs, merging runs of step +1/-1 into slices."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # The slice end is exclusive; when end + step is negative
            # (a descending run reaching index 0) it is left open.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to describe
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
        # set as soon as step is set
        # Pre-bind the last-seen index so a one-element list, for which the
        # pair loop below never runs, still emits its single element.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    # NOTE(review): the whitespace before "return" is reproduced as it appears
    # in the reviewed copy - confirm against the repository.
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 512
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a one-argument callable that runs the player's JS signature function.

    The function name is found in jscode with a regex and the function is
    extracted with JSInterpreter; the callable passes its argument to it as
    a one-element list.
    """
    signature_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(signature_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
521
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable that runs 'decipher' of the SWF class 'SignatureDecipher'.

    The callable passes its argument to the extracted function as a
    one-element list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return swf_function([s])

    return decipher
528
83799698 529 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 530 """Turn the encrypted s field into a working signature"""
6b37f0be 531
c8bf86d5 532 if player_url is None:
69ea8ca4 533 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 534
69ea8ca4 535 if player_url.startswith('//'):
78caa52a 536 player_url = 'https:' + player_url
c8bf86d5 537 try:
62af3a0e 538 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
539 if player_id not in self._player_cache:
540 func = self._extract_signature_function(
60064c53 541 video_id, player_url, s
c8bf86d5
PH
542 )
543 self._player_cache[player_id] = func
544 func = self._player_cache[player_id]
545 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 546 self._print_sig_code(func, s)
c8bf86d5
PH
547 return func(s)
548 except Exception as e:
549 tb = traceback.format_exc()
550 raise ExtractorError(
78caa52a 551 'Signature extraction failed: ' + tb, cause=e)
e0df6211 552
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to timedtext URLs.

    The track list is downloaded from video.google.com; only the first
    track found for each language code is kept. An empty dict is returned
    (after a warning) if the list cannot be downloaded or contains no
    tracks. The webpage argument is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)

    subtitle_urls = {}
    for track_name, lang_code in tracks:
        if lang_code not in subtitle_urls:
            query = compat_urllib_parse.urlencode({
                'lang': lang_code,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            subtitle_urls[lang_code] = 'https://www.youtube.com/api/timedtext?' + query

    if subtitle_urls:
        return subtitle_urls
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
580
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language codes to automatic caption URLs.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The caption base URL and timestamp are read from the 'args' of the
    ytplayer.config JSON embedded in the webpage. An empty dict is returned
    (after a warning) when the config is missing, when the first listed
    track is absent or not of kind 'asr', or when a KeyError or
    ExtractorError occurs along the way.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']

        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)

        first_track = caption_list.find('track')
        if first_track is None or first_track.attrib.get('kind') != 'asr':
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = first_track.attrib['lang_code']

        caption_urls = {}
        for target_node in caption_list.findall('target'):
            target_lang = target_node.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            caption_urls[target_lang] = caption_url + '&' + query
        return caption_urls
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
627
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of cls._VALID_URL matched against url.

    Raises ExtractorError if the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
635
1d043b93
JMF
636 def _extract_from_m3u8(self, manifest_url, video_id):
637 url_map = {}
5f6a1245 638
1d043b93
JMF
639 def _get_urls(_manifest):
640 lines = _manifest.split('\n')
641 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 642 lines)
1d043b93 643 return urls
78caa52a 644 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
645 formats_urls = _get_urls(manifest)
646 for format_url in formats_urls:
890f62e8 647 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
648 url_map[itag] = format_url
649 return url_map
650
1fb07d10
JG
651 def _extract_annotations(self, video_id):
652 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 653 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 654
def _real_extract(self, url):
    """Extract the metadata and the downloadable formats of a single video.

    The watch page is downloaded first.  The ``video_info`` mapping is then
    taken from the ``ytplayer.config`` object embedded in that page or, when
    that is not usable (or the video is age-gated), from the
    ``get_video_info`` endpoint.  Descriptive fields are scraped from the
    page, and the format list is built from an RTMP connection, the
    URL-encoded stream maps or an HLS manifest, and finally merged with the
    DASH manifest unless ``youtube_include_dash_manifest`` is disabled.

    Returns the info dictionary, or None when the ``listsubtitles`` option
    is set (the available subtitles are only listed in that case).

    Raises ExtractorError when no "token" is present in the video info,
    for "rental" videos, when the uploader name is missing, for rtmpe
    streams and when no stream information is found at all.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape the URL inside the page source
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Stays None unless the embedded player configuration is parsed below;
    # the DASH step checks this instead of touching an unbound name.
    ytplayer_config = None

    # Get video info
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn separators into single spaces before handing over to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', fatal=False)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read the "watch-<count_name>" counter from the page, or None if absent
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Build one format dict per itag; player_url is read when this is
        # called, i.e. after the signature handling may have replaced it.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Indexing (not .get()) so a missing entry is a KeyError,
                # which is reported as a warning below instead of crashing.
                dash_manifest_url = video_info['dashmpd'][0]
            elif ytplayer_config is None:
                # The player config was never parsed from the webpage, so
                # there is no manifest URL available for this video.
                raise KeyError('dashmpd')
            else:
                dash_manifest_url = ytplayer_config['args']['dashmpd']

            def decrypt_sig(mobj):
                # Replace an encrypted "/s/<sig>" path part by the decrypted signature
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note='Downloading DASH manifest',
                errnote='Could not download DASH manifest')
            for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, or add a new one
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning('Skipping DASH manifest: %r' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 999
5f6a1245 1000
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, given as URL or as bare playlist id."""
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix ('RD...' playlist id)."""
        # A mix is generated from a single video: the last 11 characters
        # of the playlist id are used as the id of that video.
        seed_video_id = playlist_id[-11:]
        webpage = self._download_webpage(
            'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id),
            playlist_id, 'Downloading Youtube mix')

        # First element found wins; class names are tried in this order
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_video_re = r'''(?xs)data-video-username=".*?".*?
                   href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_video_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for *url*."""
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        # A "v" query parameter means the URL also points at one video
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Collect the video ids page by page; the first "page" is the
        # playlist webpage itself, later ones come from the load-more JSON.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # Duplicates are removed and the link with index 0 is dropped
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not load_more:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1171
1172
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Locate the named list on the channel page and return its videos.

        The channel page is searched for a link whose module title
        contains the url-encoded ``title=<list title>`` query; the linked
        page is then downloaded until it yields at least one video id.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        # NOTE(review): the number of retries is not bounded
        for i in itertools.count(0):
            # This downloads a top list, not a mix, so say so in the progress note
            msg = 'Downloading Youtube top list'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1214
1215
class YoutubeChannelIE(InfoExtractor):
    """Extractor listing the videos of a YouTube channel page."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order, without duplicates."""
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        # Extract channel id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = url_match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk through the json-based channel_ajax pages until the
            # "load more" widget no longer advertises another page
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1275
1276
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a user, read from the GData feed."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only when no other extractor of this module does.

        The pattern of this extractor is very permissive, so every other
        module-level name ending in 'IE' is asked first.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        # Extract username
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        username = url_match.group(1)

        page_size = self._GDATA_PAGE_SIZE

        # The API returns a limited number of results per query, so the
        # feed is fetched page by page; a page without 'entry' yields nothing.
        def download_page(pagenum):
            start_index = pagenum * page_size + 1
            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))

            feed = response['feed']
            if 'entry' in feed:
                # Extract video identifiers
                for entry in feed['entry']:
                    title = entry['title']['$t']
                    # The video id is the last path component of the entry id
                    video_id = entry['id']['$t'].split('/')[-1]
                    yield {
                        '_type': 'url',
                        'url': video_id,
                        'ie_key': 'Youtube',
                        'id': video_id,
                        'title': title,
                    }

        return self.playlist_result(
            OnDemandPagedList(download_page, page_size),
            playlist_title=username)
1350
b05654f0
PH
1351
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ('ytsearch' key)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        page_size = 50
        collected_ids = []
        wanted = n
        pages_done = 0

        while page_size * pages_done < wanted:
            start_index = page_size * pages_done + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pages_done + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says exists
            wanted = min(n, api_response['totalItems'])
            pages_done += 1

        # The last page may have delivered more ids than requested
        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1393
c9ae7b95 1394
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor whose API URL adds ``orderby=published``."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1400
c9ae7b95
PH
1401
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for the URL of a YouTube search results page."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the results page and return its entries as a playlist."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            # One result: its title (optional) and its link (required)
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
                part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        entries = [
            build_entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1443
1444
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; each season is delegated as a playlist."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = [
            m.group(1)
            for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1479
1480
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed page URL template; the one remaining %s takes the paging value."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """
        Download the feed page by page and return a playlist result.

        Video ids are collected from the HTML of every page; the paging
        value for the next request is read from the "load more" widget.
        Extraction stops when no further paging value is advertised or
        when a page carries no feed markup at all.
        """
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % i)
            feed_html = info.get('feed_html') or info.get('content_html')
            if not feed_html:
                # Neither key held any markup: there is nothing to scan and
                # passing None to the regex functions below would raise
                # TypeError, so treat this page as the end of the feed.
                break
            # The widget may be missing, in which case the paging link is
            # looked up in the feed markup itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1526
5f6a1245 1527
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1533
5f6a1245 1534
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1541
5f6a1245 1542
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed, loaded as a personal feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about; the resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1549
5f6a1245 1550
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result for the playlist id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1561
1562
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed, paged through its "load more" links."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the video ids of every feed page and return them as a playlist."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # does not contain an index.
        video_ids = []
        # The initial page serves both as content and as "load more" widget.
        content_html = first_page
        more_widget_html = first_page

        page_num = 0
        while True:
            page_num += 1
            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            # Duplicates are dropped within a page only.
            video_ids.extend(orderedSet(found))

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if load_more is None:
                break

            next_chunk = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = next_chunk['content_html']
            more_widget_html = next_chunk['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1599
1600
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lost their parameters and report why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        # Such URLs typically result from an unquoted "&" on the command
        # line, so the message tells the user how to quote the URL.
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)