]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[livestream:original] Fix RTMP parameters (Fixes #4040)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
9c44d242 29 OnDemandPagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and report whether a page came back.

        The download is non-fatal, so a failure results in False rather
        than an exception.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; give up
                # instead of dereferencing the missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Still seeing the login form after submitting it: credentials rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True.

        The download is fatal by default, so a failure propagates from
        _download_webpage instead of being reported through the return value.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first failure.

        Nothing is done without a downloader.  The language is only set
        when a username is available.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 203 IE_DESC = 'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
edb53e2d 206 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
ac7553d0 215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
c5e8d7af 231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
1d043b93 249
86fe61c8 250 # 3d videos
43b81eb9
PH
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 258
96fb5605 259 # Apple HTTP Live Streaming
43b81eb9
PH
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
267
268 # DASH mp4 video
43b81eb9
PH
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 277
f6f1fc92 278 # Dash mp4 audio
2c62dc26
PH
279 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
280 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
281 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
282
283 # Dash webm
e75cafe9
A
284 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 290 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
291 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 298 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 299 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
300
301 # Dash webm audio
55db73ef 302 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 303 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
304
305 # RTMP (unnamed)
306 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 307 }
836a086c 308
78caa52a 309 IE_NAME = 'youtube'
2eb88d95
PH
310 _TESTS = [
311 {
4bc3a23e
PH
312 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
313 'info_dict': {
314 'id': 'BaW_jenozKc',
315 'ext': 'mp4',
316 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
317 'uploader': 'Philipp Hagemeister',
318 'uploader_id': 'phihag',
319 'upload_date': '20121002',
320 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
321 'categories': ['Science & Technology'],
3e7c1224
PH
322 'like_count': int,
323 'dislike_count': int,
2eb88d95 324 }
0e853ca4 325 },
0e853ca4 326 {
4bc3a23e
PH
327 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
328 'note': 'Test generic use_cipher_signature video (#897)',
329 'info_dict': {
330 'id': 'UxxajLWwzqY',
331 'ext': 'mp4',
332 'upload_date': '20120506',
333 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
334 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
335 'uploader': 'Icona Pop',
336 'uploader_id': 'IconaPop',
2eb88d95 337 }
c108eb73
JMF
338 },
339 {
4bc3a23e
PH
340 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
341 'note': 'Test VEVO video with age protection (#956)',
342 'info_dict': {
343 'id': '07FYdnEawAQ',
344 'ext': 'mp4',
345 'upload_date': '20130703',
346 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
347 'description': 'md5:64249768eec3bc4276236606ea996373',
348 'uploader': 'justintimberlakeVEVO',
349 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
350 }
351 },
fccd3771 352 {
4bc3a23e
PH
353 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
354 'note': 'Embed-only video (#1746)',
355 'info_dict': {
356 'id': 'yZIXLfi8CZQ',
357 'ext': 'mp4',
358 'upload_date': '20120608',
359 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
360 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
361 'uploader': 'SET India',
362 'uploader_id': 'setindia'
fccd3771
PH
363 }
364 },
dd27fd17 365 {
4bc3a23e
PH
366 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
367 'note': '256k DASH audio (format 141) via DASH manifest',
368 'info_dict': {
369 'id': 'a9LDPn-MO4I',
370 'ext': 'm4a',
371 'upload_date': '20121002',
372 'uploader_id': '8KVIDEO',
373 'description': '',
374 'uploader': '8KVIDEO',
375 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 376 },
4bc3a23e
PH
377 'params': {
378 'youtube_include_dash_manifest': True,
379 'format': '141',
4919603f 380 },
dd27fd17 381 },
3489b7d2
JMF
382 # DASH manifest with encrypted signature
383 {
78caa52a
PH
384 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
385 'info_dict': {
386 'id': 'IB3lcPjvWLA',
387 'ext': 'm4a',
388 'title': 'Afrojack - The Spark ft. Spree Wilson',
389 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
390 'uploader': 'AfrojackVEVO',
391 'uploader_id': 'AfrojackVEVO',
392 'upload_date': '20131011',
3489b7d2 393 },
4bc3a23e 394 'params': {
78caa52a
PH
395 'youtube_include_dash_manifest': True,
396 'format': '141',
3489b7d2
JMF
397 },
398 },
2eb88d95
PH
399 ]
400
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature
    self._player_cache = dict()
e0df6211 404
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage for *video_id* is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 408
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that information for *video_id* is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
412
def report_unavailable_format(self, video_id, format):
    """Announce that the requested *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
416
def report_rtmp_download(self):
    """Announce that the download will go through the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 420
60064c53
PH
421 def _signature_cache_id(self, example_sig):
422 """ Return a string representation of a signature """
78caa52a 423 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
424
425 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 426 id_m = re.match(
c081b35c 427 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 428 player_url)
c081b35c
PH
429 if not id_m:
430 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
431 player_type = id_m.group('ext')
432 player_id = id_m.group('id')
433
c4417ddb 434 # Read from filesystem cache
60064c53
PH
435 func_id = '%s_%s_%s' % (
436 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 437 assert os.path.basename(func_id) == func_id
a0e07d31 438
69ea8ca4 439 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 440 if cache_spec is not None:
78caa52a 441 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 442
e0df6211
PH
443 if player_type == 'js':
444 code = self._download_webpage(
445 player_url, video_id,
69ea8ca4
PH
446 note='Downloading %s player %s' % (player_type, player_id),
447 errnote='Download of %s failed' % player_url)
83799698 448 res = self._parse_sig_js(code)
c4417ddb 449 elif player_type == 'swf':
e0df6211
PH
450 urlh = self._request_webpage(
451 player_url, video_id,
69ea8ca4
PH
452 note='Downloading %s player %s' % (player_type, player_id),
453 errnote='Download of %s failed' % player_url)
e0df6211 454 code = urlh.read()
83799698 455 res = self._parse_sig_swf(code)
e0df6211
PH
456 else:
457 assert False, 'Invalid player type %r' % player_type
458
a0e07d31 459 if cache_spec is None:
78caa52a 460 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
461 cache_res = res(test_string)
462 cache_spec = [ord(c) for c in cache_res]
83799698 463
69ea8ca4 464 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
465 return res
466
60064c53 467 def _print_sig_code(self, func, example_sig):
edf3e38e
PH
468 def gen_sig_code(idxs):
469 def _genslice(start, end, step):
78caa52a 470 starts = '' if start == 0 else str(start)
69ea8ca4
PH
471 ends = (':%d' % (end+step)) if end + step >= 0 else ':'
472 steps = '' if step == 1 else (':%d' % step)
78caa52a 473 return 's[%s%s%s]' % (starts, ends, steps)
edf3e38e
PH
474
475 step = None
0ca96d48
PH
476 start = '(Never used)' # Quelch pyflakes warnings - start will be
477 # set as soon as step is set
edf3e38e
PH
478 for i, prev in zip(idxs[1:], idxs[:-1]):
479 if step is not None:
480 if i - prev == step:
481 continue
482 yield _genslice(start, prev, step)
483 step = None
484 continue
485 if i - prev in [-1, 1]:
486 step = i - prev
487 start = prev
488 continue
489 else:
78caa52a 490 yield 's[%d]' % prev
edf3e38e 491 if step is None:
78caa52a 492 yield 's[%d]' % i
edf3e38e
PH
493 else:
494 yield _genslice(start, i, step)
495
78caa52a 496 test_string = ''.join(map(compat_chr, range(len(example_sig))))
c705320f 497 cache_res = func(test_string)
edf3e38e 498 cache_spec = [ord(c) for c in cache_res]
78caa52a 499 expr_code = ' + '.join(gen_sig_code(cache_spec))
60064c53
PH
500 signature_id_tuple = '(%s)' % (
501 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
69ea8ca4 502 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
78caa52a 503 ' return %s\n') % (signature_id_tuple, expr_code)
69ea8ca4 504 self.to_screen('Extracted signature function:\n' + code)
edf3e38e 505
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the function assigned to ``signature=`` is looked up in
    *jscode* and extracted with JSInterpreter.  The returned callable
    takes the signature string and passes it as a one-element argument
    list.
    """
    sig_func_name = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(sig_func_name)

    def _apply(s):
        return decipher([s])
    return _apply
514
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    The ``decipher`` function of the ``SignatureDecipher`` class is
    extracted with SWFInterpreter.  The returned callable takes the
    signature string and passes it as a one-element argument list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        return decipher([s])
    return _apply
521
83799698 522 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 523 """Turn the encrypted s field into a working signature"""
6b37f0be 524
c8bf86d5 525 if player_url is None:
69ea8ca4 526 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 527
69ea8ca4 528 if player_url.startswith('//'):
78caa52a 529 player_url = 'https:' + player_url
c8bf86d5 530 try:
62af3a0e 531 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
532 if player_id not in self._player_cache:
533 func = self._extract_signature_function(
60064c53 534 video_id, player_url, s
c8bf86d5
PH
535 )
536 self._player_cache[player_id] = func
537 func = self._player_cache[player_id]
538 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 539 self._print_sig_code(func, s)
c8bf86d5
PH
540 return func(s)
541 except Exception as e:
542 tb = traceback.format_exc()
543 raise ExtractorError(
78caa52a 544 'Signature extraction failed: ' + tb, cause=e)
e0df6211 545
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to subtitle download URLs.

    The track list is fetched from the timedtext service.  Only the first
    track seen for each language is kept.  An empty dict is returned, after
    a warning, when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        if lang not in sub_lang_list:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query
    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
573
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target languages to automatic caption URLs.

    The caption URL is read from the ytplayer.config found in *webpage*,
    which is passed in to avoid downloading it again.  An empty dict is
    returned, after a warning, when the config is missing, when the first
    track is not an 'asr' track, or when a key lookup or the track-list
    download fails.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config[u'args']
        caption_url = player_args[u'ttsurl']
        timestamp = player_args[u'timestamp']
        # Ask for the list of available tracks and translation targets
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)
        first_track = caption_list.find('track')
        has_asr_track = (
            first_track is not None and first_track.attrib.get('kind') == 'asr')
        if not has_asr_track:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = first_track.attrib['lang_code']

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            captions[target_lang] = caption_url + '&' + query
        return captions
    # The download can raise an ExtractorError when there are subtitles
    # but no automatic captions
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
620
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in *url*.

    Raises ExtractorError if *url* does not match _VALID_URL.
    """
    url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if url_match is not None:
        return url_match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
628
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to format URL.

    Blank lines and lines starting with '#' are ignored; every remaining
    line is treated as a format URL that contains an ``itag/<digits>/``
    component.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
642
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 646
c5e8d7af 647 def _real_extract(self, url):
7e8c0af0 648 proto = (
78caa52a
PH
649 'http' if self._downloader.params.get('prefer_insecure', False)
650 else 'https')
7e8c0af0 651
c5e8d7af
PH
652 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
653 mobj = re.search(self._NEXT_URL_RE, url)
654 if mobj:
7e8c0af0 655 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 656 video_id = self.extract_id(url)
c5e8d7af
PH
657
658 # Get video webpage
7e8c0af0 659 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
a1f934b1
PH
660 pref_cookies = [
661 c for c in self._downloader.cookiejar
662 if c.domain == '.youtube.com' and c.name == 'PREF']
663 for pc in pref_cookies:
664 if 'hl=' in pc.value:
665 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
666 else:
667 if pc.value:
668 pc.value += '&'
669 pc.value += 'hl=en'
670 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
671
672 # Attempt to extract SWF player URL
e0df6211 673 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
674 if mobj is not None:
675 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
676 else:
677 player_url = None
678
679 # Get video info
680 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
681 if re.search(r'player-age-gate-content">', video_webpage) is not None:
682 self.report_age_confirmation()
683 age_gate = True
684 # We simulate the access to the video from www.youtube.com/v/{video_id}
685 # this can be viewed without login into Youtube
2c57c7fa
JMF
686 data = compat_urllib_parse.urlencode({
687 'video_id': video_id,
688 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
689 'sts': self._search_regex(
690 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 691 })
7e8c0af0 692 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
693 video_info_webpage = self._download_webpage(video_info_url, video_id,
694 note=False,
695 errnote='unable to download video info webpage')
696 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
697 else:
698 age_gate = False
699 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 700 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
701 % (video_id, el_type))
702 video_info_webpage = self._download_webpage(video_info_url, video_id,
703 note=False,
704 errnote='unable to download video info webpage')
705 video_info = compat_parse_qs(video_info_webpage)
706 if 'token' in video_info:
707 break
c5e8d7af
PH
708 if 'token' not in video_info:
709 if 'reason' in video_info:
d11271dd 710 raise ExtractorError(
78caa52a 711 'YouTube said: %s' % video_info['reason'][0],
d11271dd 712 expected=True, video_id=video_id)
c5e8d7af 713 else:
d11271dd 714 raise ExtractorError(
78caa52a 715 '"token" parameter not in video info for unknown reason',
d11271dd 716 video_id=video_id)
c5e8d7af 717
1d699755
PH
718 if 'view_count' in video_info:
719 view_count = int(video_info['view_count'][0])
720 else:
721 view_count = None
722
c5e8d7af
PH
723 # Check for "rental" videos
724 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 725 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
726
727 # Start extracting information
728 self.report_information_extraction(video_id)
729
730 # uploader
731 if 'author' not in video_info:
69ea8ca4 732 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
733 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
734
735 # uploader_id
736 video_uploader_id = None
737 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
738 if mobj is not None:
739 video_uploader_id = mobj.group(1)
740 else:
69ea8ca4 741 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
742
743 # title
a8c6b241 744 if 'title' in video_info:
aa92f063 745 video_title = video_info['title'][0]
a8c6b241 746 else:
69ea8ca4 747 self._downloader.report_warning('Unable to extract video title')
78caa52a 748 video_title = '_'
c5e8d7af
PH
749
750 # thumbnail image
7763b04e
JMF
751 # We try first to get a high quality image:
752 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
753 video_webpage, re.DOTALL)
754 if m_thumb is not None:
755 video_thumbnail = m_thumb.group(1)
756 elif 'thumbnail_url' not in video_info:
69ea8ca4 757 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 758 video_thumbnail = None
c5e8d7af
PH
759 else: # don't panic if we can't find it
760 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
761
762 # upload date
763 upload_date = None
ad3bc6ac 764 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
765 if mobj is None:
766 mobj = re.search(
263bd4ec 767 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 768 video_webpage)
c5e8d7af
PH
769 if mobj is not None:
770 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
771 upload_date = unified_strdate(upload_date)
772
55f7bd2d
PH
773 m_cat_container = self._search_regex(
774 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
775 video_webpage, 'categories', fatal=False)
ec8deefc 776 if m_cat_container:
ad3bc6ac 777 category = self._html_search_regex(
01ed5c9b 778 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
779 default=None)
780 video_categories = None if category is None else [category]
781 else:
782 video_categories = None
ec8deefc 783
c5e8d7af
PH
784 # description
785 video_description = get_element_by_id("eow-description", video_webpage)
786 if video_description:
27dcce19
PH
787 video_description = re.sub(r'''(?x)
788 <a\s+
789 (?:[a-zA-Z-]+="[^"]+"\s+)*?
790 title="([^"]+)"\s+
791 (?:[a-zA-Z-]+="[^"]+"\s+)*?
792 class="yt-uix-redirect-link"\s*>
793 [^<]+
794 </a>
795 ''', r'\1', video_description)
c5e8d7af
PH
796 video_description = clean_html(video_description)
797 else:
798 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
799 if fd_mobj:
800 video_description = unescapeHTML(fd_mobj.group(1))
801 else:
78caa52a 802 video_description = ''
c5e8d7af 803
f30a38be 804 def _extract_count(count_name):
46374a56 805 count = self._search_regex(
f30a38be
JMF
806 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
807 video_webpage, count_name, default=None)
336c3a69
JMF
808 if count is not None:
809 return int(count.replace(',', ''))
810 return None
69ea8ca4
PH
811 like_count = _extract_count('like')
812 dislike_count = _extract_count('dislike')
336c3a69 813
c5e8d7af 814 # subtitles
d82134c3 815 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 816
c5e8d7af 817 if self._downloader.params.get('listsubtitles', False):
d665f8d3 818 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
819 return
820
821 if 'length_seconds' not in video_info:
69ea8ca4 822 self._downloader.report_warning('unable to extract video duration')
b466b702 823 video_duration = None
c5e8d7af 824 else:
b466b702 825 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 826
1fb07d10
JG
827 # annotations
828 video_annotations = None
829 if self._downloader.params.get('writeannotations', False):
830 video_annotations = self._extract_annotations(video_id)
831
c5e8d7af 832 # Decide which formats to download
c5e8d7af 833 try:
ae7ed920 834 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
835 if not mobj:
836 raise ValueError('Could not find vevo ID')
ae7ed920
PH
837 json_code = uppercase_escape(mobj.group(1))
838 ytplayer_config = json.loads(json_code)
3489b7d2 839 args = ytplayer_config['args']
7ce7e394
JMF
840 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
841 # this signatures are encrypted
44d46655 842 if 'url_encoded_fmt_stream_map' not in args:
69ea8ca4 843 raise ValueError('No stream_map present') # caught below
00fe14fc
JMF
844 re_signature = re.compile(r'[&,]s=')
845 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394 846 if m_s is not None:
69ea8ca4 847 self.to_screen('%s: Encrypted signatures detected.' % video_id)
c5e8d7af 848 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
78caa52a 849 m_s = re_signature.search(args.get('adaptive_fmts', ''))
b7a68384 850 if m_s is not None:
00fe14fc
JMF
851 if 'adaptive_fmts' in video_info:
852 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 853 else:
00fe14fc 854 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
855 except ValueError:
856 pass
857
dd27fd17
PH
858 def _map_to_format_list(urlmap):
859 formats = []
860 for itag, video_real_url in urlmap.items():
861 dct = {
862 'format_id': itag,
863 'url': video_real_url,
864 'player_url': player_url,
865 }
0b65e5d4
PH
866 if itag in self._formats:
867 dct.update(self._formats[itag])
dd27fd17
PH
868 formats.append(dct)
869 return formats
870
c5e8d7af
PH
871 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
872 self.report_rtmp_download()
dd27fd17
PH
873 formats = [{
874 'format_id': '_rtmp',
875 'protocol': 'rtmp',
876 'url': video_info['conn'][0],
877 'player_url': player_url,
878 }]
00fe14fc
JMF
879 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
880 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
881 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 882 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 883 url_map = {}
00fe14fc 884 for url_data_str in encoded_url_map.split(','):
c5e8d7af 885 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
886 if 'itag' not in url_data or 'url' not in url_data:
887 continue
888 format_id = url_data['itag'][0]
889 url = url_data['url'][0]
890
891 if 'sig' in url_data:
892 url += '&signature=' + url_data['sig'][0]
893 elif 's' in url_data:
894 encrypted_sig = url_data['s'][0]
895
896 if not age_gate:
897 jsplayer_url_json = self._search_regex(
898 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 899 video_webpage, 'JS player URL')
201e9eaa
PH
900 player_url = json.loads(jsplayer_url_json)
901 if player_url is None:
902 player_url_json = self._search_regex(
903 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 904 video_webpage, 'age gate player URL')
201e9eaa
PH
905 player_url = json.loads(player_url_json)
906
907 if self._downloader.params.get('verbose'):
cf010131 908 if player_url is None:
201e9eaa
PH
909 player_version = 'unknown'
910 player_desc = 'unknown'
911 else:
912 if player_url.endswith('swf'):
913 player_version = self._search_regex(
914 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 915 'flash player', fatal=False)
201e9eaa 916 player_desc = 'flash player %s' % player_version
cf010131 917 else:
201e9eaa
PH
918 player_version = self._search_regex(
919 r'html5player-([^/]+?)(?:/html5player)?\.js',
920 player_url,
921 'html5 player', fatal=False)
78caa52a 922 player_desc = 'html5 player %s' % player_version
201e9eaa 923
60064c53 924 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 925 self.to_screen('{%s} signature length %s, %s' %
98eb1c3f 926 (format_id, parts_sizes, player_desc))
201e9eaa
PH
927
928 signature = self._decrypt_signature(
929 encrypted_sig, video_id, player_url, age_gate)
930 url += '&signature=' + signature
931 if 'ratebypass' not in url:
932 url += '&ratebypass=yes'
933 url_map[format_id] = url
dd27fd17 934 formats = _map_to_format_list(url_map)
1d043b93
JMF
935 elif video_info.get('hlsvp'):
936 manifest_url = video_info['hlsvp'][0]
937 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 938 formats = _map_to_format_list(url_map)
c5e8d7af 939 else:
69ea8ca4 940 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 941
dd27fd17 942 # Look for the DASH manifest
203fb43f 943 if self._downloader.params.get('youtube_include_dash_manifest', True):
dd27fd17 944 try:
d68f0cdb 945 # The DASH manifest used needs to be the one from the original video_webpage.
946 # The one found in get_video_info seems to be using different signatures.
947 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
948 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
949 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
950 if age_gate:
3489b7d2 951 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 952 else:
3489b7d2 953 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 954 def decrypt_sig(mobj):
955 s = mobj.group(1)
956 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
957 return '/signature/%s' % dec_s
958 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 959 dash_doc = self._download_xml(
d68f0cdb 960 dash_manifest_url, video_id,
69ea8ca4
PH
961 note='Downloading DASH manifest',
962 errnote='Could not download DASH manifest')
963 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
dd27fd17
PH
964 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
965 if url_el is None:
966 continue
967 format_id = r.attrib['id']
968 video_url = url_el.text
969 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
970 f = {
971 'format_id': format_id,
972 'url': video_url,
973 'width': int_or_none(r.attrib.get('width')),
974 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
975 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
976 'filesize': filesize,
977 }
978 try:
979 existing_format = next(
980 fo for fo in formats
981 if fo['format_id'] == format_id)
982 except StopIteration:
983 f.update(self._formats.get(format_id, {}))
984 formats.append(f)
985 else:
986 existing_format.update(f)
987
988 except (ExtractorError, KeyError) as e:
69ea8ca4 989 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
d80044c2 990
4bcc7bd1 991 self._sort_formats(formats)
4ea3be0a 992
993 return {
994 'id': video_id,
995 'uploader': video_uploader,
996 'uploader_id': video_uploader_id,
997 'upload_date': upload_date,
998 'title': video_title,
999 'thumbnail': video_thumbnail,
1000 'description': video_description,
ec8deefc 1001 'categories': video_categories,
4ea3be0a 1002 'subtitles': video_subtitles,
1003 'duration': video_duration,
1004 'age_limit': 18 if age_gate else 0,
1005 'annotations': video_annotations,
7e8c0af0 1006 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1007 'view_count': view_count,
1008 'like_count': like_count,
1009 'dislike_count': dislike_count,
1010 'formats': formats,
1011 }
c5e8d7af 1012
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including auto-generated mixes ('RD' ids)."""
    IE_DESC = 'YouTube.com playlists'
    # Group 1 is the playlist id taken from a youtube.com URL, group 2 is a
    # bare playlist id given without any URL around it.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': 'Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before extraction starts."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix.

        The watch page of the mix is downloaded; the title is read from the
        first matching title element and the video ids from the links that
        reference this playlist id.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        # None of the class names may match; do not hand None to clean_html,
        # fall back to an untitled playlist instead.
        title = clean_html(title_span) if title_span is not None else None
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for `url`.

        May instead return a single-video url_result when the URL carries a
        'v' parameter and the 'noplaylist' option is set.  Raises
        ExtractorError for invalid URLs, 'TL' top lists and playlists that do
        not exist or are private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # The first page serves both as video listing and as "load more" widget
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1182
1183
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on download attempts for the list page, so a page that never
    # contains any video cannot make the extractor loop forever.
    _MAX_LIST_PAGE_TRIES = 10

    def _real_extract(self, url):
        """Return the playlist result for the requested top list.

        The channel page is searched for the link of the list with the given
        title; the linked page is then downloaded (retrying a bounded number
        of times) until it yields video ids.  Raises ExtractorError when no
        attempt yields any video id.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them (or run out of attempts)
        for i in range(self._MAX_LIST_PAGE_TRIES):
            msg = 'Downloading Youtube top list'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            # Every attempt came back without videos
            raise ExtractorError(
                'Unable to find any video in top list "%s" after %d attempts'
                % (title, self._MAX_LIST_PAGE_TRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1225
1226
class YoutubeChannelIE(InfoExtractor):
    """Extractor listing the videos of a YouTube channel."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from `page`, first occurrence order, no duplicates."""
        unique_ids = []
        for candidate in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if candidate in unique_ids:
                continue
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with one 'Youtube' url_result per channel video."""
        # The channel id is the only capture group of _VALID_URL
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Fetch the "videos" tab of the channel
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # Auto-generated channels hold all their videos in that single
            # page; the ajax pages are empty for them and cannot be used.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the "load more"
            # widget no longer advertises a further page.
            video_ids = []
            pagenum = 0
            while True:
                pagenum += 1
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1286
1287
class YoutubeUserIE(InfoExtractor):
    """Extractor listing the uploads of a YouTube user through the GData feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '*IE' class of this module accepts `url`."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result whose entries are fetched page by page on demand."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video of the zero-based page `pagenum`."""
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested by this query (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A feed without entries ends this page without yielding anything
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1359
b05654f0
PH
1360
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ('ytsearch' keyword)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        found_ids = []
        limit = n
        pagenum = 0

        # `limit` shrinks to the total reported by the API once a page is read
        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, start_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            found_ids.extend([item['id'] for item in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        if len(found_ids) > n:
            found_ids = found_ids[:n]
        videos = []
        for video_id in found_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1402
c9ae7b95 1403
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds 'orderby=published'."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1409
c9ae7b95
PH
1410
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results search pages."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict titled with the decoded search query."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        # Only the result list itself is scanned for items
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for item_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1452
1453
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season is delegated to 'YoutubePlaylist'."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one url_result per season playlist link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional: a page without og:title yields a warning only
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1488
1489
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the one remaining '%s' is the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extraction starts."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of every feed page."""
        feed_entries = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget the feed html itself is searched
            # for the link to the next page
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1535
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1541
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (a personal feed)."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1548
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (a personal feed)."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: '\.' is not a valid escape in a normal string literal and
    # triggers a warning on newer Python versions; the pattern value itself
    # is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1555
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its underlying playlist."""
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the favourites playlist id and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1566
1567
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the subscriptions feed as a playlist of videos."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    IE_NAME = 'youtube:subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect video ids from every feed page and return a playlist."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for regular playlists, except that the video
        # id regex used here carries no index.
        video_ids = []
        # The first page serves both as content and as the place where the
        # "load more" link is looked up.
        videos_html = widget_html = first_page

        page_num = 0
        while True:
            page_num += 1
            video_ids += orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', videos_html))

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                # No further "load more" link: this was the last page
                break

            next_chunk = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            videos_html = next_chunk['content_html']
            widget_html = next_chunk['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1604
1605
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that lack a video id and report a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the problem."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        # expected=True marks this as a user error, not an extractor bug
        raise ExtractorError(quoting_hint, expected=True)