jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add formats 298, 299 (Fixes #4056)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
9c44d242 29 OnDemandPagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL (non-fatally) and return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Keep the documented contract: a failed login is False, not None
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of dereferencing the failed match below.
                return False
            sec_tok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            time_stmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': sec_tok,
                'timeStmp': time_stmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; failures are non-fatal."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (only when credentials exist), log in, then confirm age.

        Each step is skipped if the previous one failed; nothing is done at
        all when no downloader is attached.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 203 IE_DESC = 'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
edb53e2d 206 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
ac7553d0 215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
c5e8d7af 231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
1d043b93 249
86fe61c8 250 # 3d videos
43b81eb9
PH
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 258
96fb5605 259 # Apple HTTP Live Streaming
43b81eb9
PH
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
267
268 # DASH mp4 video
43b81eb9
PH
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 277
f6f1fc92 278 # Dash mp4 audio
2c62dc26
PH
279 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
280 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
281 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
282
283 # Dash webm
e75cafe9
A
284 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 290 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
291 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 298 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 299 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
300
301 # Dash webm audio
55db73ef 302 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 303 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 304
fbb21cf5
PH
305 # Dash mov
306 '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
307 '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
308
ce6b9a2d
PH
309 # RTMP (unnamed)
310 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 311 }
836a086c 312
78caa52a 313 IE_NAME = 'youtube'
2eb88d95
PH
314 _TESTS = [
315 {
4bc3a23e
PH
316 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
317 'info_dict': {
318 'id': 'BaW_jenozKc',
319 'ext': 'mp4',
320 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
321 'uploader': 'Philipp Hagemeister',
322 'uploader_id': 'phihag',
323 'upload_date': '20121002',
324 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
325 'categories': ['Science & Technology'],
3e7c1224
PH
326 'like_count': int,
327 'dislike_count': int,
2eb88d95 328 }
0e853ca4 329 },
0e853ca4 330 {
4bc3a23e
PH
331 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
332 'note': 'Test generic use_cipher_signature video (#897)',
333 'info_dict': {
334 'id': 'UxxajLWwzqY',
335 'ext': 'mp4',
336 'upload_date': '20120506',
337 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
338 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
339 'uploader': 'Icona Pop',
340 'uploader_id': 'IconaPop',
2eb88d95 341 }
c108eb73
JMF
342 },
343 {
4bc3a23e
PH
344 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
345 'note': 'Test VEVO video with age protection (#956)',
346 'info_dict': {
347 'id': '07FYdnEawAQ',
348 'ext': 'mp4',
349 'upload_date': '20130703',
350 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
351 'description': 'md5:64249768eec3bc4276236606ea996373',
352 'uploader': 'justintimberlakeVEVO',
353 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
354 }
355 },
fccd3771 356 {
4bc3a23e
PH
357 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
358 'note': 'Embed-only video (#1746)',
359 'info_dict': {
360 'id': 'yZIXLfi8CZQ',
361 'ext': 'mp4',
362 'upload_date': '20120608',
363 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
364 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
365 'uploader': 'SET India',
366 'uploader_id': 'setindia'
fccd3771
PH
367 }
368 },
dd27fd17 369 {
4bc3a23e
PH
370 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
371 'note': '256k DASH audio (format 141) via DASH manifest',
372 'info_dict': {
373 'id': 'a9LDPn-MO4I',
374 'ext': 'm4a',
375 'upload_date': '20121002',
376 'uploader_id': '8KVIDEO',
377 'description': '',
378 'uploader': '8KVIDEO',
379 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 380 },
4bc3a23e
PH
381 'params': {
382 'youtube_include_dash_manifest': True,
383 'format': '141',
4919603f 384 },
dd27fd17 385 },
3489b7d2
JMF
386 # DASH manifest with encrypted signature
387 {
78caa52a
PH
388 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
389 'info_dict': {
390 'id': 'IB3lcPjvWLA',
391 'ext': 'm4a',
392 'title': 'Afrojack - The Spark ft. Spree Wilson',
393 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
394 'uploader': 'AfrojackVEVO',
395 'uploader_id': 'AfrojackVEVO',
396 'upload_date': '20131011',
3489b7d2 397 },
4bc3a23e 398 'params': {
78caa52a
PH
399 'youtube_include_dash_manifest': True,
400 'format': '141',
3489b7d2
JMF
401 },
402 },
2eb88d95
PH
403 ]
404
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature with extracted signature functions
    self._player_cache = dict()
e0df6211 408
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 412
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
416
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
420
def report_rtmp_download(self):
    """Tell the user that the download will go over RTMP."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 424
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts, e.g. 'abc.de' -> '3.2'."""
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
428
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are taken from player_url. A
    previously stored result is looked up in the downloader cache under the
    'youtube-sigfuncs' section; on a miss the player is downloaded, parsed,
    and its behaviour is stored in the cache as a list of source indices.

    Raises ExtractorError if the player URL cannot be identified or its
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Invariant guaranteed by the regex above: the cache key has no path parts
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, for each output position, the input index to copy
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # A real exception rather than an assert: asserts vanish under -O,
        # which would leave `res` unbound further down.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed the function a string whose characters encode their own position,
    # so the output reveals which input index ends up where.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
470
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function `func`.

    `func` is probed with a string whose characters encode their own
    positions; the resulting index list is rendered as a sum of single
    index and slice expressions on `s` and sent to the screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) walked with the given stride
        head = str(first) if first != 0 else ''
        stop = last + stride
        tail = (':%d' % stop) if stop >= 0 else ':'
        stride_part = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (head, tail, stride_part)

    def _emit_pieces(indices):
        stride = None
        # Placeholder only; always overwritten before a slice is rendered
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is None:
                if delta in [-1, 1]:
                    # A run of consecutive indices begins at `before`
                    stride = delta
                    run_start = before
                else:
                    yield 's[%d]' % before
            elif delta != stride:
                # The current run ended at `before`
                yield _slice_expr(run_start, before, stride)
                stride = None
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_list = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(_emit_pieces(index_list))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 509
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The function name is the identifier following `signature=` in the
    player code; the returned callable passes its string argument to that
    function as a one-element argument list.
    """
    funcname = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        'Initial JS player signature function name')
    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def _decipher(s):
        return initial_function([s])

    return _decipher
518
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the Flash player binary.

    Uses the `decipher` function of the `SignatureDecipher` class found in
    the SWF; the returned callable passes its string argument to it as a
    one-element argument list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    swfi = SWFInterpreter(file_contents)
    decipher_class = swfi.extract_class(TARGET_CLASSNAME)
    initial_function = swfi.extract_function(decipher_class, 'decipher')

    def _decipher(s):
        return initial_function([s])

    return _decipher
525
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The signature function is extracted once per (player URL, signature
    layout) pair and kept in self._player_cache. Any failure during
    extraction or application is re-raised as ExtractorError carrying the
    formatted traceback. `age_gate` is accepted but not used here.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URLs are fetched over HTTPS
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        details = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + details, cause=e)
e0df6211 549
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL.

    The track list is fetched from the timedtext listing; only the first
    track seen for each language is kept. An empty dict is returned (with a
    warning) when the list cannot be downloaded or contains no tracks.
    `webpage` is not used by this method.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        if lang not in sub_lang_list:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
577
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic-caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The caption base URL and timestamp are read from the `ytplayer.config`
    JSON embedded in the page. An empty dict is returned (with a warning)
    when the config is missing or unparsable, when required keys are
    absent, when the listing download fails, or when the first track is not
    an automatic ('asr') one.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        # The non-greedy capture can stop early and yield truncated JSON;
        # parsing therefore sits inside the try (ValueError handled below).
        player_config = json.loads(mobj.group(1))
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ValueError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
624
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (second group of _VALID_URL) found in `url`.

    Raises ExtractorError when the URL does not match _VALID_URL.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    if matched is not None:
        return matched.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
632
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to format URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is read from the `itag/<digits>/` path part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and m3u8 directives/comments
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
646
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for the video."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 650
c5e8d7af 651 def _real_extract(self, url):
7e8c0af0 652 proto = (
78caa52a
PH
653 'http' if self._downloader.params.get('prefer_insecure', False)
654 else 'https')
7e8c0af0 655
c5e8d7af
PH
656 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
657 mobj = re.search(self._NEXT_URL_RE, url)
658 if mobj:
7e8c0af0 659 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 660 video_id = self.extract_id(url)
c5e8d7af
PH
661
662 # Get video webpage
7e8c0af0 663 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
a1f934b1
PH
664 pref_cookies = [
665 c for c in self._downloader.cookiejar
666 if c.domain == '.youtube.com' and c.name == 'PREF']
667 for pc in pref_cookies:
668 if 'hl=' in pc.value:
669 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
670 else:
671 if pc.value:
672 pc.value += '&'
673 pc.value += 'hl=en'
674 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
675
676 # Attempt to extract SWF player URL
e0df6211 677 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
678 if mobj is not None:
679 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
680 else:
681 player_url = None
682
683 # Get video info
684 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
685 if re.search(r'player-age-gate-content">', video_webpage) is not None:
686 self.report_age_confirmation()
687 age_gate = True
688 # We simulate the access to the video from www.youtube.com/v/{video_id}
689 # this can be viewed without login into Youtube
2c57c7fa
JMF
690 data = compat_urllib_parse.urlencode({
691 'video_id': video_id,
692 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
693 'sts': self._search_regex(
694 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 695 })
7e8c0af0 696 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
697 video_info_webpage = self._download_webpage(video_info_url, video_id,
698 note=False,
699 errnote='unable to download video info webpage')
700 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
701 else:
702 age_gate = False
703 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 704 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
705 % (video_id, el_type))
706 video_info_webpage = self._download_webpage(video_info_url, video_id,
707 note=False,
708 errnote='unable to download video info webpage')
709 video_info = compat_parse_qs(video_info_webpage)
710 if 'token' in video_info:
711 break
c5e8d7af
PH
712 if 'token' not in video_info:
713 if 'reason' in video_info:
d11271dd 714 raise ExtractorError(
78caa52a 715 'YouTube said: %s' % video_info['reason'][0],
d11271dd 716 expected=True, video_id=video_id)
c5e8d7af 717 else:
d11271dd 718 raise ExtractorError(
78caa52a 719 '"token" parameter not in video info for unknown reason',
d11271dd 720 video_id=video_id)
c5e8d7af 721
1d699755
PH
722 if 'view_count' in video_info:
723 view_count = int(video_info['view_count'][0])
724 else:
725 view_count = None
726
c5e8d7af
PH
727 # Check for "rental" videos
728 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 729 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
730
731 # Start extracting information
732 self.report_information_extraction(video_id)
733
734 # uploader
735 if 'author' not in video_info:
69ea8ca4 736 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
737 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
738
739 # uploader_id
740 video_uploader_id = None
741 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
742 if mobj is not None:
743 video_uploader_id = mobj.group(1)
744 else:
69ea8ca4 745 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
746
747 # title
a8c6b241 748 if 'title' in video_info:
aa92f063 749 video_title = video_info['title'][0]
a8c6b241 750 else:
69ea8ca4 751 self._downloader.report_warning('Unable to extract video title')
78caa52a 752 video_title = '_'
c5e8d7af
PH
753
754 # thumbnail image
7763b04e
JMF
755 # We try first to get a high quality image:
756 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
757 video_webpage, re.DOTALL)
758 if m_thumb is not None:
759 video_thumbnail = m_thumb.group(1)
760 elif 'thumbnail_url' not in video_info:
69ea8ca4 761 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 762 video_thumbnail = None
c5e8d7af
PH
763 else: # don't panic if we can't find it
764 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
765
766 # upload date
767 upload_date = None
ad3bc6ac 768 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
769 if mobj is None:
770 mobj = re.search(
263bd4ec 771 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 772 video_webpage)
c5e8d7af
PH
773 if mobj is not None:
774 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
775 upload_date = unified_strdate(upload_date)
776
55f7bd2d
PH
777 m_cat_container = self._search_regex(
778 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
779 video_webpage, 'categories', fatal=False)
ec8deefc 780 if m_cat_container:
ad3bc6ac 781 category = self._html_search_regex(
01ed5c9b 782 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
783 default=None)
784 video_categories = None if category is None else [category]
785 else:
786 video_categories = None
ec8deefc 787
c5e8d7af
PH
788 # description
789 video_description = get_element_by_id("eow-description", video_webpage)
790 if video_description:
27dcce19
PH
791 video_description = re.sub(r'''(?x)
792 <a\s+
793 (?:[a-zA-Z-]+="[^"]+"\s+)*?
794 title="([^"]+)"\s+
795 (?:[a-zA-Z-]+="[^"]+"\s+)*?
796 class="yt-uix-redirect-link"\s*>
797 [^<]+
798 </a>
799 ''', r'\1', video_description)
c5e8d7af
PH
800 video_description = clean_html(video_description)
801 else:
802 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
803 if fd_mobj:
804 video_description = unescapeHTML(fd_mobj.group(1))
805 else:
78caa52a 806 video_description = ''
c5e8d7af 807
f30a38be 808 def _extract_count(count_name):
46374a56 809 count = self._search_regex(
f30a38be
JMF
810 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
811 video_webpage, count_name, default=None)
336c3a69
JMF
812 if count is not None:
813 return int(count.replace(',', ''))
814 return None
69ea8ca4
PH
815 like_count = _extract_count('like')
816 dislike_count = _extract_count('dislike')
336c3a69 817
c5e8d7af 818 # subtitles
d82134c3 819 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 820
c5e8d7af 821 if self._downloader.params.get('listsubtitles', False):
d665f8d3 822 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
823 return
824
825 if 'length_seconds' not in video_info:
69ea8ca4 826 self._downloader.report_warning('unable to extract video duration')
b466b702 827 video_duration = None
c5e8d7af 828 else:
b466b702 829 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 830
1fb07d10
JG
831 # annotations
832 video_annotations = None
833 if self._downloader.params.get('writeannotations', False):
834 video_annotations = self._extract_annotations(video_id)
835
c5e8d7af 836 # Decide which formats to download
c5e8d7af 837 try:
ae7ed920 838 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
839 if not mobj:
840 raise ValueError('Could not find vevo ID')
ae7ed920
PH
841 json_code = uppercase_escape(mobj.group(1))
842 ytplayer_config = json.loads(json_code)
3489b7d2 843 args = ytplayer_config['args']
7ce7e394
JMF
844 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
845 # this signatures are encrypted
44d46655 846 if 'url_encoded_fmt_stream_map' not in args:
69ea8ca4 847 raise ValueError('No stream_map present') # caught below
00fe14fc
JMF
848 re_signature = re.compile(r'[&,]s=')
849 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394 850 if m_s is not None:
69ea8ca4 851 self.to_screen('%s: Encrypted signatures detected.' % video_id)
c5e8d7af 852 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
78caa52a 853 m_s = re_signature.search(args.get('adaptive_fmts', ''))
b7a68384 854 if m_s is not None:
00fe14fc
JMF
855 if 'adaptive_fmts' in video_info:
856 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 857 else:
00fe14fc 858 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
859 except ValueError:
860 pass
861
dd27fd17
PH
862 def _map_to_format_list(urlmap):
863 formats = []
864 for itag, video_real_url in urlmap.items():
865 dct = {
866 'format_id': itag,
867 'url': video_real_url,
868 'player_url': player_url,
869 }
0b65e5d4
PH
870 if itag in self._formats:
871 dct.update(self._formats[itag])
dd27fd17
PH
872 formats.append(dct)
873 return formats
874
c5e8d7af
PH
875 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
876 self.report_rtmp_download()
dd27fd17
PH
877 formats = [{
878 'format_id': '_rtmp',
879 'protocol': 'rtmp',
880 'url': video_info['conn'][0],
881 'player_url': player_url,
882 }]
00fe14fc
JMF
883 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
884 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
885 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 886 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 887 url_map = {}
00fe14fc 888 for url_data_str in encoded_url_map.split(','):
c5e8d7af 889 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
890 if 'itag' not in url_data or 'url' not in url_data:
891 continue
892 format_id = url_data['itag'][0]
893 url = url_data['url'][0]
894
895 if 'sig' in url_data:
896 url += '&signature=' + url_data['sig'][0]
897 elif 's' in url_data:
898 encrypted_sig = url_data['s'][0]
899
900 if not age_gate:
901 jsplayer_url_json = self._search_regex(
902 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 903 video_webpage, 'JS player URL')
201e9eaa
PH
904 player_url = json.loads(jsplayer_url_json)
905 if player_url is None:
906 player_url_json = self._search_regex(
907 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 908 video_webpage, 'age gate player URL')
201e9eaa
PH
909 player_url = json.loads(player_url_json)
910
911 if self._downloader.params.get('verbose'):
cf010131 912 if player_url is None:
201e9eaa
PH
913 player_version = 'unknown'
914 player_desc = 'unknown'
915 else:
916 if player_url.endswith('swf'):
917 player_version = self._search_regex(
918 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 919 'flash player', fatal=False)
201e9eaa 920 player_desc = 'flash player %s' % player_version
cf010131 921 else:
201e9eaa
PH
922 player_version = self._search_regex(
923 r'html5player-([^/]+?)(?:/html5player)?\.js',
924 player_url,
925 'html5 player', fatal=False)
78caa52a 926 player_desc = 'html5 player %s' % player_version
201e9eaa 927
60064c53 928 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 929 self.to_screen('{%s} signature length %s, %s' %
98eb1c3f 930 (format_id, parts_sizes, player_desc))
201e9eaa
PH
931
932 signature = self._decrypt_signature(
933 encrypted_sig, video_id, player_url, age_gate)
934 url += '&signature=' + signature
935 if 'ratebypass' not in url:
936 url += '&ratebypass=yes'
937 url_map[format_id] = url
dd27fd17 938 formats = _map_to_format_list(url_map)
1d043b93
JMF
939 elif video_info.get('hlsvp'):
940 manifest_url = video_info['hlsvp'][0]
941 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 942 formats = _map_to_format_list(url_map)
c5e8d7af 943 else:
69ea8ca4 944 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 945
dd27fd17 946 # Look for the DASH manifest
203fb43f 947 if self._downloader.params.get('youtube_include_dash_manifest', True):
dd27fd17 948 try:
d68f0cdb 949 # The DASH manifest used needs to be the one from the original video_webpage.
950 # The one found in get_video_info seems to be using different signatures.
951 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
952 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
953 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
954 if age_gate:
3489b7d2 955 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 956 else:
3489b7d2 957 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 958 def decrypt_sig(mobj):
959 s = mobj.group(1)
960 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
961 return '/signature/%s' % dec_s
962 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 963 dash_doc = self._download_xml(
d68f0cdb 964 dash_manifest_url, video_id,
69ea8ca4
PH
965 note='Downloading DASH manifest',
966 errnote='Could not download DASH manifest')
967 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
dd27fd17
PH
968 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
969 if url_el is None:
970 continue
971 format_id = r.attrib['id']
972 video_url = url_el.text
973 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
974 f = {
975 'format_id': format_id,
976 'url': video_url,
977 'width': int_or_none(r.attrib.get('width')),
978 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
979 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
980 'filesize': filesize,
981 }
982 try:
983 existing_format = next(
984 fo for fo in formats
985 if fo['format_id'] == format_id)
986 except StopIteration:
987 f.update(self._formats.get(format_id, {}))
988 formats.append(f)
989 else:
990 existing_format.update(f)
991
992 except (ExtractorError, KeyError) as e:
69ea8ca4 993 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
d80044c2 994
4bcc7bd1 995 self._sort_formats(formats)
4ea3be0a 996
997 return {
998 'id': video_id,
999 'uploader': video_uploader,
1000 'uploader_id': video_uploader_id,
1001 'upload_date': upload_date,
1002 'title': video_title,
1003 'thumbnail': video_thumbnail,
1004 'description': video_description,
ec8deefc 1005 'categories': video_categories,
4ea3be0a 1006 'subtitles': video_subtitles,
1007 'duration': video_duration,
1008 'age_limit': 18 if age_gate else 0,
1009 'annotations': video_annotations,
7e8c0af0 1010 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1011 'view_count': view_count,
1012 'like_count': like_count,
1013 'dislike_count': dislike_count,
1014 'formats': formats,
1015 }
c5e8d7af 1016
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a YouTube playlist.

    Handles regular playlist pages (followed through their "load more"
    links) and mixes, i.e. playlist ids starting with 'RD'.
    """
    IE_DESC = 'YouTube.com playlists'
    # Group 1: playlist id found in a youtube.com URL.
    # Group 2: a bare playlist id given without any URL.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _ids_to_results(self, ids):
        """Turn an iterable of video ids into a list of 'Youtube' url results."""
        results = []
        for video_id in ids:
            results.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix by scraping the watch page of its seed video."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the last 11 characters are the video id.
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Use the first CSS class that yields a non-empty element; if none
        # does, the result of the last lookup is passed on unchanged.
        title_html = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_html = get_element_by_attribute('class', class_name, webpage)
            if title_html:
                break
        title = clean_html(title_html)

        mix_entry_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_entry_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result for *url*.

        If the URL also names a video and the 'noplaylist' option is set,
        a single video result is returned instead.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(
            self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages. The first page acts
        # both as the video listing and as the "load more" widget container.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1186
1187
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract a channel's top list given as "yttoplist:{channel}:{list title}"."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on how often the list page is downloaded while waiting
    # for the video markup to show up.
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return a playlist result holding the videos of the top list.

        Raises ExtractorError when the list link cannot be found on the
        channel page, or when no video ids could be found after
        _MAX_ATTEMPTS downloads of the list page.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        # NOTE(review): the pattern interpolates the urlencoded "title=..."
        # query rather than the bare list title -- confirm this is intended.
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # Sometimes the webpage doesn't contain the videos, so retry. The
        # number of attempts is bounded so that a page which never contains
        # them cannot keep us downloading forever.
        for attempt in range(self._MAX_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos in the top list after %d attempts'
                % self._MAX_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1229
1230
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos listed on a YouTube channel's "videos" page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order, without duplicates."""
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers a further page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1290
1291
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploads through the paged GData uploads feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module claims *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: our regex is too permissive and would match as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result whose entries are fetched page by page."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url entries for the zero-based page *pagenum*."""
            # start-index is 1-based in the feed URL built below.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # A page holds _GDATA_PAGE_SIZE entries, so the last requested
            # index is inclusive: e.g. 1..50, not 1..51.
            last_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, last_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing to yield.
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1363
b05654f0
PH
1364
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData video search API ("ytsearch" key)."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        encoded_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        pagenum = 0
        # Starts as the requested count and is lowered after every page to
        # the number of items the API reports as available.
        limit = n

        while PAGE_SIZE * pagenum < limit:
            # start-index is 1-based in the API URL.
            result_url = self._API_URL % (encoded_query, PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past n; keep the first n ids only.
        videos = []
        for video_id in video_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1406
c9ae7b95 1407
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1413
c9ae7b95
PH
1414
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the results listed on a youtube.com/results search page."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one url entry per result on the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Every result item carries its link and title in an <h3> heading.
        for heading in re.finditer(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_code = heading.group(1)
            # The title is optional (fatal=False); the URL is mandatory.
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1457
class YoutubeShowIE(InfoExtractor):
    """Extract a show as a playlist of its per-season playlists."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict whose entries point at the season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1492
1493
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    https://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining '%s' takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos from every feed page."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       '%s feed' % self._FEED_NAME,
                                       'Downloading page %s' % page_num)
            # The markup is read from 'feed_html', falling back to
            # 'content_html' when that key is missing or empty.
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget the feed markup itself is searched
            # for the "load more" link.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_mobj is None:
                break
            # The next request uses the paging value the widget advertises.
            paging = next_mobj.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1539
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed (system feed)."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Matches either the feed URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1545
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (personal feed)."""

    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Matches either the feed URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1552
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (personal feed)."""

    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: a plain literal containing "\." is an invalid escape sequence
    # (deprecated by Python, now a SyntaxWarning). The regex itself is unchanged
    # and now matches the r'...' style of the sibling feed extractors.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1559
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that forwards the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    # Matches either the favourites URL or the ":ytfav" family of keywords.
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1570
1571
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed, paged via "load more" links."""

    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the video ids from every feed page and return them as a playlist."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Extraction works like it does for playlists, except that the
        # video id regex carries no index.
        collected_ids = []
        # On the first page the videos and the "load more" link live in the
        # same document; later pages deliver them as separate JSON fields.
        content_html = first_page
        more_widget_html = first_page

        for page_num in itertools.count(1):
            page_ids = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            collected_ids.extend(orderedSet(page_ids))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if more_match is None:
                break

            more_url = 'https://youtube.com/%s' % more_match.group('more')
            more = self._download_json(
                more_url, title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(collected_ids),
        }
1608
1609
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that were cut off and explain how to quote them."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError telling the user to quote the URL."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        # expected=True marks this as a user-facing error.
        raise ExtractorError(quoting_hint, expected=True)