]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Better error message for DASH manifest
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
9c44d242 29 OnDemandPagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        The download is non-fatal; the return value is True when a page
        was obtained and False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Honour the documented contract: a failed login yields False
            # (previously a bare return produced None).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            hidden_fields = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    # Without the field the TFA form cannot be submitted;
                    # give up instead of dereferencing a missing match.
                    self._downloader.report_warning('Failed to get %s - did the page structure change?' % field)
                    return False
                hidden_fields[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': hidden_fields['secTok'],
                'timeStmp': hidden_fields['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL (non-fatal on failure)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (only when credentials exist), log in, confirm age.

        Each step is skipped when the previous one failed; nothing is done
        without a downloader.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 203 IE_DESC = 'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
edb53e2d 206 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
ac7553d0 215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
c5e8d7af 231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
1d043b93 249
86fe61c8 250 # 3d videos
43b81eb9
PH
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 258
96fb5605 259 # Apple HTTP Live Streaming
43b81eb9
PH
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
267
268 # DASH mp4 video
43b81eb9
PH
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
277 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
279 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 280
f6f1fc92 281 # Dash mp4 audio
2c62dc26
PH
282 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
283 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
284 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
285
286 # Dash webm
e75cafe9
A
287 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
290 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
291 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
292 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 293 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
294 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 301 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 302 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
303 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
304 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
305
306 # Dash webm audio
55db73ef 307 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 308 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
309
310 # RTMP (unnamed)
311 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 312 }
836a086c 313
78caa52a 314 IE_NAME = 'youtube'
2eb88d95
PH
315 _TESTS = [
316 {
4bc3a23e
PH
317 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
318 'info_dict': {
319 'id': 'BaW_jenozKc',
320 'ext': 'mp4',
321 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
322 'uploader': 'Philipp Hagemeister',
323 'uploader_id': 'phihag',
324 'upload_date': '20121002',
325 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
326 'categories': ['Science & Technology'],
3e7c1224
PH
327 'like_count': int,
328 'dislike_count': int,
2eb88d95 329 }
0e853ca4 330 },
0e853ca4 331 {
4bc3a23e
PH
332 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
333 'note': 'Test generic use_cipher_signature video (#897)',
334 'info_dict': {
335 'id': 'UxxajLWwzqY',
336 'ext': 'mp4',
337 'upload_date': '20120506',
338 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
339 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
340 'uploader': 'Icona Pop',
341 'uploader_id': 'IconaPop',
2eb88d95 342 }
c108eb73
JMF
343 },
344 {
4bc3a23e
PH
345 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
346 'note': 'Test VEVO video with age protection (#956)',
347 'info_dict': {
348 'id': '07FYdnEawAQ',
349 'ext': 'mp4',
350 'upload_date': '20130703',
351 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
352 'description': 'md5:64249768eec3bc4276236606ea996373',
353 'uploader': 'justintimberlakeVEVO',
354 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
355 }
356 },
fccd3771 357 {
4bc3a23e
PH
358 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
359 'note': 'Embed-only video (#1746)',
360 'info_dict': {
361 'id': 'yZIXLfi8CZQ',
362 'ext': 'mp4',
363 'upload_date': '20120608',
364 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
365 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
366 'uploader': 'SET India',
367 'uploader_id': 'setindia'
fccd3771
PH
368 }
369 },
dd27fd17 370 {
4bc3a23e
PH
371 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
372 'note': '256k DASH audio (format 141) via DASH manifest',
373 'info_dict': {
374 'id': 'a9LDPn-MO4I',
375 'ext': 'm4a',
376 'upload_date': '20121002',
377 'uploader_id': '8KVIDEO',
378 'description': '',
379 'uploader': '8KVIDEO',
380 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 381 },
4bc3a23e
PH
382 'params': {
383 'youtube_include_dash_manifest': True,
384 'format': '141',
4919603f 385 },
dd27fd17 386 },
3489b7d2
JMF
387 # DASH manifest with encrypted signature
388 {
78caa52a
PH
389 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
390 'info_dict': {
391 'id': 'IB3lcPjvWLA',
392 'ext': 'm4a',
393 'title': 'Afrojack - The Spark ft. Spree Wilson',
394 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
395 'uploader': 'AfrojackVEVO',
396 'uploader_id': 'AfrojackVEVO',
397 'upload_date': '20131011',
3489b7d2 398 },
4bc3a23e 399 'params': {
78caa52a
PH
400 'youtube_include_dash_manifest': True,
401 'format': '141',
3489b7d2
JMF
402 },
403 },
2eb88d95
PH
404 ]
405
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, stored by _decrypt_signature under
    # (player_url, signature cache id) keys.
    self._player_cache = dict()
e0df6211 409
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 413
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
417
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
421
def report_rtmp_download(self):
    """Tell the user that the RTMP protocol was detected for the download."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 425
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated signature parts.

    For example a signature shaped like 'abc.de' yields '3.2'.
    """
    part_lengths = [len(chunk) for chunk in example_sig.split('.')]
    return '.'.join(map(compat_str, part_lengths))
60064c53
PH
429
430 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 431 id_m = re.match(
c081b35c 432 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 433 player_url)
c081b35c
PH
434 if not id_m:
435 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
436 player_type = id_m.group('ext')
437 player_id = id_m.group('id')
438
c4417ddb 439 # Read from filesystem cache
60064c53
PH
440 func_id = '%s_%s_%s' % (
441 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 442 assert os.path.basename(func_id) == func_id
a0e07d31 443
69ea8ca4 444 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 445 if cache_spec is not None:
78caa52a 446 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 447
e0df6211
PH
448 if player_type == 'js':
449 code = self._download_webpage(
450 player_url, video_id,
69ea8ca4
PH
451 note='Downloading %s player %s' % (player_type, player_id),
452 errnote='Download of %s failed' % player_url)
83799698 453 res = self._parse_sig_js(code)
c4417ddb 454 elif player_type == 'swf':
e0df6211
PH
455 urlh = self._request_webpage(
456 player_url, video_id,
69ea8ca4
PH
457 note='Downloading %s player %s' % (player_type, player_id),
458 errnote='Download of %s failed' % player_url)
e0df6211 459 code = urlh.read()
83799698 460 res = self._parse_sig_swf(code)
e0df6211
PH
461 else:
462 assert False, 'Invalid player type %r' % player_type
463
a0e07d31 464 if cache_spec is None:
78caa52a 465 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
466 cache_res = res(test_string)
467 cache_spec = [ord(c) for c in cache_res]
83799698 468
69ea8ca4 469 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
470 return res
471
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is probed with a string whose i-th character has code point i
    (as long as example_sig), so the code points of its output are the
    source indexes it selects.  Those indexes are rendered as a sum of
    index and slice expressions on a variable named s and shown via
    self.to_screen.
    """
    def gen_sig_code(idxs):
        # Yield 's[...]' expressions covering idxs in order; runs that
        # move by +1 or -1 each step are collapsed into one slice.
        def _genslice(start, end, step):
            # Render s[start:end+step:step], leaving out a start of 0 and
            # a step of 1, and using an open end when end+step is negative.
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end+step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Placeholder to quell pyflakes warnings - start will be set as
        # soon as step is set
        start = '(Never used)'
        # Walk over consecutive pairs (prev, i) of indexes.
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the run that ended at prev.
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run begins at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair.
        # NOTE(review): with fewer than two indexes the loop body never
        # runs and i is unbound here - confirm callers never hit that.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    # NOTE(review): this listing came from a whitespace-collapsed view, so
    # the indent inside the second literal may originally have been wider
    # - confirm against upstream.
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 510
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature function is located in jscode with a regex
    and the function is extracted through JSInterpreter.  The returned
    callable takes the signature string and passes it on as the single
    element of an argument list.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
519
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    through SWFInterpreter.  The returned callable takes the signature
    string and passes it on as the single element of an argument list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return swf_function([s])

    return decipher
526
83799698 527 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 528 """Turn the encrypted s field into a working signature"""
6b37f0be 529
c8bf86d5 530 if player_url is None:
69ea8ca4 531 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 532
69ea8ca4 533 if player_url.startswith('//'):
78caa52a 534 player_url = 'https:' + player_url
c8bf86d5 535 try:
62af3a0e 536 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
537 if player_id not in self._player_cache:
538 func = self._extract_signature_function(
60064c53 539 video_id, player_url, s
c8bf86d5
PH
540 )
541 self._player_cache[player_id] = func
542 func = self._player_cache[player_id]
543 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 544 self._print_sig_code(func, s)
c8bf86d5
PH
545 return func(s)
546 except Exception as e:
547 tb = traceback.format_exc()
548 raise ExtractorError(
78caa52a 549 'Signature extraction failed: ' + tb, cause=e)
e0df6211 550
1f343eaa 551 def _get_available_subtitles(self, video_id, webpage):
de7f3446 552 try:
7fad1c63 553 sub_list = self._download_webpage(
38c2e5b8 554 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
555 video_id, note=False)
556 except ExtractorError as err:
69ea8ca4 557 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446
JMF
558 return {}
559 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
560
561 sub_lang_list = {}
562 for l in lang_list:
563 lang = l[1]
7e660ac1
LD
564 if lang in sub_lang_list:
565 continue
de7f3446
JMF
566 params = compat_urllib_parse.urlencode({
567 'lang': lang,
568 'v': video_id,
ca715127 569 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 570 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 571 })
78caa52a 572 url = 'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
573 sub_lang_list[lang] = url
574 if not sub_lang_list:
69ea8ca4 575 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
576 return {}
577 return sub_lang_list
578
055e6f36 579 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
580 """We need the webpage for getting the captions url, pass it as an
581 argument to speed up the process."""
ca715127 582 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 583 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 584 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 585 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
586 if mobj is None:
587 self._downloader.report_warning(err_msg)
588 return {}
589 player_config = json.loads(mobj.group(1))
590 try:
591 args = player_config[u'args']
592 caption_url = args[u'ttsurl']
593 timestamp = args[u'timestamp']
055e6f36
JMF
594 # We get the available subtitles
595 list_params = compat_urllib_parse.urlencode({
596 'type': 'list',
597 'tlangs': 1,
598 'asrs': 1,
de7f3446 599 })
055e6f36 600 list_url = caption_url + '&' + list_params
e26f8712 601 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 602 original_lang_node = caption_list.find('track')
f6a54188 603 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
69ea8ca4 604 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
605 return {}
606 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
607
608 sub_lang_list = {}
609 for lang_node in caption_list.findall('target'):
610 sub_lang = lang_node.attrib['lang_code']
611 params = compat_urllib_parse.urlencode({
612 'lang': original_lang,
613 'tlang': sub_lang,
614 'fmt': sub_format,
615 'ts': timestamp,
616 'kind': 'asr',
617 })
618 sub_lang_list[sub_lang] = caption_url + '&' + params
619 return sub_lang_list
de7f3446
JMF
620 # An extractor error can be raise by the download process if there are
621 # no automatic captions but there are subtitles
622 except (KeyError, ExtractorError):
623 self._downloader.report_warning(err_msg)
624 return {}
625
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (second group of _VALID_URL) found in url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
633
1d043b93
JMF
634 def _extract_from_m3u8(self, manifest_url, video_id):
635 url_map = {}
636 def _get_urls(_manifest):
637 lines = _manifest.split('\n')
638 urls = filter(lambda l: l and not l.startswith('#'),
639 lines)
640 return urls
78caa52a 641 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
642 formats_urls = _get_urls(manifest)
643 for format_url in formats_urls:
890f62e8 644 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
645 url_map[itag] = format_url
646 return url_map
647
1fb07d10
JG
648 def _extract_annotations(self, video_id):
649 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 650 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 651
c5e8d7af 652 def _real_extract(self, url):
7e8c0af0 653 proto = (
78caa52a
PH
654 'http' if self._downloader.params.get('prefer_insecure', False)
655 else 'https')
7e8c0af0 656
c5e8d7af
PH
657 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
658 mobj = re.search(self._NEXT_URL_RE, url)
659 if mobj:
7e8c0af0 660 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 661 video_id = self.extract_id(url)
c5e8d7af
PH
662
663 # Get video webpage
7e8c0af0 664 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
a1f934b1
PH
665 pref_cookies = [
666 c for c in self._downloader.cookiejar
667 if c.domain == '.youtube.com' and c.name == 'PREF']
668 for pc in pref_cookies:
669 if 'hl=' in pc.value:
670 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
671 else:
672 if pc.value:
673 pc.value += '&'
674 pc.value += 'hl=en'
675 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
676
677 # Attempt to extract SWF player URL
e0df6211 678 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
679 if mobj is not None:
680 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
681 else:
682 player_url = None
683
684 # Get video info
685 self.report_video_info_webpage_download(video_id)
c108eb73 686 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
687 age_gate = True
688 # We simulate the access to the video from www.youtube.com/v/{video_id}
689 # this can be viewed without login into Youtube
2c57c7fa
JMF
690 data = compat_urllib_parse.urlencode({
691 'video_id': video_id,
692 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 693 'sts': self._search_regex(
94bd3613 694 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
2c57c7fa 695 })
7e8c0af0 696 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
697 video_info_webpage = self._download_webpage(
698 video_info_url, video_id,
20436c30 699 note='Refetching age-gated info webpage',
94bd3613 700 errnote='unable to download video info webpage')
c5e8d7af 701 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
702 else:
703 age_gate = False
704 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 705 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
706 % (video_id, el_type))
707 video_info_webpage = self._download_webpage(video_info_url, video_id,
708 note=False,
709 errnote='unable to download video info webpage')
710 video_info = compat_parse_qs(video_info_webpage)
711 if 'token' in video_info:
712 break
c5e8d7af
PH
713 if 'token' not in video_info:
714 if 'reason' in video_info:
d11271dd 715 raise ExtractorError(
78caa52a 716 'YouTube said: %s' % video_info['reason'][0],
d11271dd 717 expected=True, video_id=video_id)
c5e8d7af 718 else:
d11271dd 719 raise ExtractorError(
78caa52a 720 '"token" parameter not in video info for unknown reason',
d11271dd 721 video_id=video_id)
c5e8d7af 722
1d699755
PH
723 if 'view_count' in video_info:
724 view_count = int(video_info['view_count'][0])
725 else:
726 view_count = None
727
c5e8d7af
PH
728 # Check for "rental" videos
729 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 730 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
731
732 # Start extracting information
733 self.report_information_extraction(video_id)
734
735 # uploader
736 if 'author' not in video_info:
69ea8ca4 737 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
738 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
739
740 # uploader_id
741 video_uploader_id = None
742 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
743 if mobj is not None:
744 video_uploader_id = mobj.group(1)
745 else:
69ea8ca4 746 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
747
748 # title
a8c6b241 749 if 'title' in video_info:
aa92f063 750 video_title = video_info['title'][0]
a8c6b241 751 else:
69ea8ca4 752 self._downloader.report_warning('Unable to extract video title')
78caa52a 753 video_title = '_'
c5e8d7af
PH
754
755 # thumbnail image
7763b04e
JMF
756 # We try first to get a high quality image:
757 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
758 video_webpage, re.DOTALL)
759 if m_thumb is not None:
760 video_thumbnail = m_thumb.group(1)
761 elif 'thumbnail_url' not in video_info:
69ea8ca4 762 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 763 video_thumbnail = None
c5e8d7af
PH
764 else: # don't panic if we can't find it
765 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
766
767 # upload date
768 upload_date = None
ad3bc6ac 769 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
770 if mobj is None:
771 mobj = re.search(
263bd4ec 772 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 773 video_webpage)
c5e8d7af
PH
774 if mobj is not None:
775 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
776 upload_date = unified_strdate(upload_date)
777
55f7bd2d
PH
778 m_cat_container = self._search_regex(
779 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
780 video_webpage, 'categories', fatal=False)
ec8deefc 781 if m_cat_container:
ad3bc6ac 782 category = self._html_search_regex(
01ed5c9b 783 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
784 default=None)
785 video_categories = None if category is None else [category]
786 else:
787 video_categories = None
ec8deefc 788
c5e8d7af
PH
789 # description
790 video_description = get_element_by_id("eow-description", video_webpage)
791 if video_description:
27dcce19
PH
792 video_description = re.sub(r'''(?x)
793 <a\s+
794 (?:[a-zA-Z-]+="[^"]+"\s+)*?
795 title="([^"]+)"\s+
796 (?:[a-zA-Z-]+="[^"]+"\s+)*?
797 class="yt-uix-redirect-link"\s*>
798 [^<]+
799 </a>
800 ''', r'\1', video_description)
c5e8d7af
PH
801 video_description = clean_html(video_description)
802 else:
803 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
804 if fd_mobj:
805 video_description = unescapeHTML(fd_mobj.group(1))
806 else:
78caa52a 807 video_description = ''
c5e8d7af 808
f30a38be 809 def _extract_count(count_name):
46374a56 810 count = self._search_regex(
f30a38be
JMF
811 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
812 video_webpage, count_name, default=None)
336c3a69
JMF
813 if count is not None:
814 return int(count.replace(',', ''))
815 return None
69ea8ca4
PH
816 like_count = _extract_count('like')
817 dislike_count = _extract_count('dislike')
336c3a69 818
c5e8d7af 819 # subtitles
d82134c3 820 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 821
c5e8d7af 822 if self._downloader.params.get('listsubtitles', False):
d665f8d3 823 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
824 return
825
826 if 'length_seconds' not in video_info:
69ea8ca4 827 self._downloader.report_warning('unable to extract video duration')
b466b702 828 video_duration = None
c5e8d7af 829 else:
b466b702 830 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 831
1fb07d10
JG
832 # annotations
833 video_annotations = None
834 if self._downloader.params.get('writeannotations', False):
835 video_annotations = self._extract_annotations(video_id)
836
c5e8d7af 837 # Decide which formats to download
c5e8d7af 838 try:
ae7ed920 839 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
840 if not mobj:
841 raise ValueError('Could not find vevo ID')
ae7ed920
PH
842 json_code = uppercase_escape(mobj.group(1))
843 ytplayer_config = json.loads(json_code)
3489b7d2 844 args = ytplayer_config['args']
7ce7e394
JMF
845 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
846 # this signatures are encrypted
44d46655 847 if 'url_encoded_fmt_stream_map' not in args:
69ea8ca4 848 raise ValueError('No stream_map present') # caught below
00fe14fc
JMF
849 re_signature = re.compile(r'[&,]s=')
850 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394 851 if m_s is not None:
69ea8ca4 852 self.to_screen('%s: Encrypted signatures detected.' % video_id)
c5e8d7af 853 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
78caa52a 854 m_s = re_signature.search(args.get('adaptive_fmts', ''))
b7a68384 855 if m_s is not None:
00fe14fc
JMF
856 if 'adaptive_fmts' in video_info:
857 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 858 else:
00fe14fc 859 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
860 except ValueError:
861 pass
862
dd27fd17
PH
863 def _map_to_format_list(urlmap):
864 formats = []
865 for itag, video_real_url in urlmap.items():
866 dct = {
867 'format_id': itag,
868 'url': video_real_url,
869 'player_url': player_url,
870 }
0b65e5d4
PH
871 if itag in self._formats:
872 dct.update(self._formats[itag])
dd27fd17
PH
873 formats.append(dct)
874 return formats
875
c5e8d7af
PH
876 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
877 self.report_rtmp_download()
dd27fd17
PH
878 formats = [{
879 'format_id': '_rtmp',
880 'protocol': 'rtmp',
881 'url': video_info['conn'][0],
882 'player_url': player_url,
883 }]
00fe14fc
JMF
884 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
885 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
886 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 887 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 888 url_map = {}
00fe14fc 889 for url_data_str in encoded_url_map.split(','):
c5e8d7af 890 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
891 if 'itag' not in url_data or 'url' not in url_data:
892 continue
893 format_id = url_data['itag'][0]
894 url = url_data['url'][0]
895
896 if 'sig' in url_data:
897 url += '&signature=' + url_data['sig'][0]
898 elif 's' in url_data:
899 encrypted_sig = url_data['s'][0]
900
901 if not age_gate:
902 jsplayer_url_json = self._search_regex(
903 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 904 video_webpage, 'JS player URL')
201e9eaa
PH
905 player_url = json.loads(jsplayer_url_json)
906 if player_url is None:
907 player_url_json = self._search_regex(
908 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 909 video_webpage, 'age gate player URL')
201e9eaa
PH
910 player_url = json.loads(player_url_json)
911
912 if self._downloader.params.get('verbose'):
cf010131 913 if player_url is None:
201e9eaa
PH
914 player_version = 'unknown'
915 player_desc = 'unknown'
916 else:
917 if player_url.endswith('swf'):
918 player_version = self._search_regex(
919 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 920 'flash player', fatal=False)
201e9eaa 921 player_desc = 'flash player %s' % player_version
cf010131 922 else:
201e9eaa
PH
923 player_version = self._search_regex(
924 r'html5player-([^/]+?)(?:/html5player)?\.js',
925 player_url,
926 'html5 player', fatal=False)
78caa52a 927 player_desc = 'html5 player %s' % player_version
201e9eaa 928
60064c53 929 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 930 self.to_screen('{%s} signature length %s, %s' %
98eb1c3f 931 (format_id, parts_sizes, player_desc))
201e9eaa
PH
932
933 signature = self._decrypt_signature(
934 encrypted_sig, video_id, player_url, age_gate)
935 url += '&signature=' + signature
936 if 'ratebypass' not in url:
937 url += '&ratebypass=yes'
938 url_map[format_id] = url
dd27fd17 939 formats = _map_to_format_list(url_map)
1d043b93
JMF
940 elif video_info.get('hlsvp'):
941 manifest_url = video_info['hlsvp'][0]
942 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 943 formats = _map_to_format_list(url_map)
c5e8d7af 944 else:
69ea8ca4 945 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 946
dd27fd17 947 # Look for the DASH manifest
203fb43f 948 if self._downloader.params.get('youtube_include_dash_manifest', True):
dd27fd17 949 try:
d68f0cdb 950 # The DASH manifest used needs to be the one from the original video_webpage.
951 # The one found in get_video_info seems to be using different signatures.
952 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
953 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
954 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
955 if age_gate:
3489b7d2 956 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 957 else:
3489b7d2 958 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 959 def decrypt_sig(mobj):
960 s = mobj.group(1)
961 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
962 return '/signature/%s' % dec_s
963 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 964 dash_doc = self._download_xml(
d68f0cdb 965 dash_manifest_url, video_id,
69ea8ca4
PH
966 note='Downloading DASH manifest',
967 errnote='Could not download DASH manifest')
968 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
dd27fd17
PH
969 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
970 if url_el is None:
971 continue
972 format_id = r.attrib['id']
973 video_url = url_el.text
974 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
975 f = {
976 'format_id': format_id,
977 'url': video_url,
978 'width': int_or_none(r.attrib.get('width')),
979 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
980 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
981 'filesize': filesize,
982 }
983 try:
984 existing_format = next(
985 fo for fo in formats
986 if fo['format_id'] == format_id)
987 except StopIteration:
988 f.update(self._formats.get(format_id, {}))
989 formats.append(f)
990 else:
991 existing_format.update(f)
992
993 except (ExtractorError, KeyError) as e:
23ad44b5 994 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
d80044c2 995
4bcc7bd1 996 self._sort_formats(formats)
4ea3be0a 997
998 return {
999 'id': video_id,
1000 'uploader': video_uploader,
1001 'uploader_id': video_uploader_id,
1002 'upload_date': upload_date,
1003 'title': video_title,
1004 'thumbnail': video_thumbnail,
1005 'description': video_description,
ec8deefc 1006 'categories': video_categories,
4ea3be0a 1007 'subtitles': video_subtitles,
1008 'duration': video_duration,
1009 'age_limit': 18 if age_gate else 0,
1010 'annotations': video_annotations,
7e8c0af0 1011 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1012 'view_count': view_count,
1013 'like_count': like_count,
1014 'dislike_count': dislike_count,
1015 'formats': formats,
1016 }
c5e8d7af 1017
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the video entries of a YouTube playlist or mix.

    Both playlist URLs and bare playlist ids are accepted (see
    ``_VALID_URL``). Every video found is returned as a URL result that
    is delegated to the 'Youtube' extractor.
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        # Attempt the login before any extraction request is made.
        self._login()

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' URL result per video id."""
        results = []
        for vid_id in ids:
            results.append(
                self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix.

        Mixes are generated from a single video: the id of the playlist
        is just 'RD' + video_id, so the watch page of that video (opened
        with the mix as list) is scraped for the entries.
        """
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # Probe the candidate title elements in order and stop at the
        # first one that yields non-empty content.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_entry_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_entry_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for url.

        Raises ExtractorError for URLs that do not match, for top lists
        ('TL' ids) and for playlists reported as missing or private.
        """
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        playlist_url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(playlist_url, playlist_id)
        # For the first page both the entries and the "load more" widget
        # are looked up in the full playlist page.
        content_html = page
        more_widget_html = page

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                entry.group('id')
                for entry in re.finditer(self._VIDEO_RE, content_html)
                if entry.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_match is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1188
1189
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract the videos of a channel top list.

    The list is addressed with the pseudo URL
    "yttoplist:{channel}:{list title}"; the channel page is searched for
    a link whose module title contains the requested list title.
    """
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on downloads of the list page. Without a limit a page
    # that never contains the videos would be re-requested forever.
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in url.

        Raises ExtractorError when no video id could be found on the
        list page after _MAX_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The title is matched in its URL-encoded form ("title=...").
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for attempt in range(self._MAX_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Every attempt came back without any video id.
            raise ExtractorError(
                'Unable to find any videos in top list "%s" after %d attempts'
                % (title, self._MAX_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1231
1232
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos listed for a YouTube channel.

    Auto-generated channels are read from the single /videos page; all
    other channels are walked page by page through the JSON "load more"
    endpoint given by ``_MORE_PAGES_URL``.
    """
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked in page, without duplicates.

        The ids keep the order of their first occurrence.
        """
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        # Extract channel id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers a further page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1292
1293
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user.

    Accepts user URLs and the "ytuser:" keyword. The video ids are read
    page by page from the JSON feed addressed by ``_GDATA_URL``.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor should handle url.

        The regex of this class is too permissive, so the URL is
        rejected whenever any other ``*IE`` object found in this
        module's globals reports it as suitable.
        """
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with the uploads of the user in url.

        The entries are produced lazily through OnDemandPagedList.
        Raises ExtractorError for a non-matching URL or when a feed
        page is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one URL entry per video of the given 0-based page."""
            # The feed's start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Inclusive index of the last entry this page can hold.
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries ends the pagination.
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1365
b05654f0
PH
1366
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor behind the "ytsearch" keyword.

    Results are fetched in pages of 50 ids from the JSON-C API addressed
    by ``_API_URL``.
    """
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            # The API's start-index is 1-based.
            start_index = PAGE_SIZE * pagenum + 1
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, start_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never ask for more than the API says is available.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the total past n.
        if len(video_ids) > n:
            del video_ids[n:]

        videos = []
        for video_id in video_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1408
c9ae7b95 1409
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE using the "ytsearchdate" keyword.

    Only the constants differ: the API URL additionally carries
    "orderby=published".
    """
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1415
c9ae7b95
PH
1416
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the results listed on a YouTube search results URL."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist of URL entries scraped from the results page.

        The playlist title is the unquoted search query taken from url.
        """
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is described by one "yt-lockup-title" heading.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional (fatal=False), the link is required.
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
                part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1458
1459
class YoutubeShowIE(InfoExtractor):
    """Extract a show as a playlist of its season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the season playlists.

        Every entry is delegated to the 'YoutubePlaylist' extractor.
        """
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional; a missing og:title is not fatal.
        title = self._og_search_title(webpage, fatal=False)

        result = {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
        return result
04cc9617
JMF
1494
1495
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed page URL with one remaining %s slot for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as a playlist.

        Paging stops as soon as a page carries no "load more" link with a
        numeric ``paging`` value.
        """
        results = []
        paging = 0
        page_num = 1
        while True:
            page = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            # The markup is looked up under 'feed_html' first, then
            # 'content_html'.
            feed_html = page.get('feed_html') or page.get('content_html')
            # Without a dedicated widget, look for the next-page link in the
            # feed markup itself.
            load_more_html = page.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                results.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1

        return self.playlist_result(results, playlist_title=self._PLAYLIST_TITLE)
1541
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed (URL or ":ytrec" keyword)."""

    # Feed selection and resulting playlist title
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    # Matching and listing metadata
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1547
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (URL or ":ytwatchlater")."""

    # Feed selection: watch later is flagged as a personal feed
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'

    # Matching and listing metadata
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1554
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (URL or ":ythistory" keyword)."""

    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: '\.' in a plain literal is an
    # invalid escape sequence that newer Python versions warn about.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The watch history is flagged as a personal feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1561
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Hand the favourites playlist id over to the playlist extractor.

        The id is taken from the first ``list=...`` occurrence in the
        favourites page.
        """
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(
            favourites_url, 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1572
1573
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed into one playlist."""

    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Page through the subscriptions feed and return a playlist dict.

        The first page is the regular HTML page; following pages are JSON
        documents reached through the "load more" link.
        """
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # carries no index.
        video_ids = []
        content_html = first_page
        more_widget_html = first_page
        page_num = 1

        while True:
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            video_ids.extend(orderedSet(found))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if more_link is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        # _ids_to_results is not defined in this class; presumably inherited
        # from YoutubePlaylistIE or one of its bases.
        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1610
1611
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack a video id and explain why."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint.

        This extractor never returns a result; it only tells the user how
        to pass the URL so that the shell does not cut it off at '&'.
        """
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)