]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
release 2014.10.30
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
9c44d242 29 OnDemandPagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Fetch _LANG_URL (non-fatally) and return the truthiness of the result."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises,
            # instead of falling out with an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # rather than crash on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; failures are non-fatal."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (when credentials exist), log in, then confirm age.

        Each step is skipped if the previous one did not succeed; nothing is
        done at all when no downloader is attached.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 203 IE_DESC = 'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
edb53e2d 206 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
ac7553d0 215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
c5e8d7af 231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
1d043b93 249
86fe61c8 250 # 3d videos
43b81eb9
PH
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 258
96fb5605 259 # Apple HTTP Live Streaming
43b81eb9
PH
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
267
268 # DASH mp4 video
43b81eb9
PH
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 277
f6f1fc92 278 # Dash mp4 audio
2c62dc26
PH
279 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
280 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
281 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
282
283 # Dash webm
e75cafe9
A
284 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 290 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
291 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 298 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 299 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
300 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
301 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
302
303 # Dash webm audio
55db73ef 304 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 305 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 306
fbb21cf5
PH
307 # Dash mov
308 '298': {'ext': 'mov', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
309 '299': {'ext': 'mov', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
7b6de372 310 '266': {'ext': 'mov', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
fbb21cf5 311
ce6b9a2d
PH
312 # RTMP (unnamed)
313 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 314 }
836a086c 315
78caa52a 316 IE_NAME = 'youtube'
2eb88d95
PH
317 _TESTS = [
318 {
4bc3a23e
PH
319 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
320 'info_dict': {
321 'id': 'BaW_jenozKc',
322 'ext': 'mp4',
323 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
324 'uploader': 'Philipp Hagemeister',
325 'uploader_id': 'phihag',
326 'upload_date': '20121002',
327 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
328 'categories': ['Science & Technology'],
3e7c1224
PH
329 'like_count': int,
330 'dislike_count': int,
2eb88d95 331 }
0e853ca4 332 },
0e853ca4 333 {
4bc3a23e
PH
334 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
335 'note': 'Test generic use_cipher_signature video (#897)',
336 'info_dict': {
337 'id': 'UxxajLWwzqY',
338 'ext': 'mp4',
339 'upload_date': '20120506',
340 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
341 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
342 'uploader': 'Icona Pop',
343 'uploader_id': 'IconaPop',
2eb88d95 344 }
c108eb73
JMF
345 },
346 {
4bc3a23e
PH
347 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
348 'note': 'Test VEVO video with age protection (#956)',
349 'info_dict': {
350 'id': '07FYdnEawAQ',
351 'ext': 'mp4',
352 'upload_date': '20130703',
353 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
354 'description': 'md5:64249768eec3bc4276236606ea996373',
355 'uploader': 'justintimberlakeVEVO',
356 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
357 }
358 },
fccd3771 359 {
4bc3a23e
PH
360 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
361 'note': 'Embed-only video (#1746)',
362 'info_dict': {
363 'id': 'yZIXLfi8CZQ',
364 'ext': 'mp4',
365 'upload_date': '20120608',
366 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
367 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
368 'uploader': 'SET India',
369 'uploader_id': 'setindia'
fccd3771
PH
370 }
371 },
dd27fd17 372 {
4bc3a23e
PH
373 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
374 'note': '256k DASH audio (format 141) via DASH manifest',
375 'info_dict': {
376 'id': 'a9LDPn-MO4I',
377 'ext': 'm4a',
378 'upload_date': '20121002',
379 'uploader_id': '8KVIDEO',
380 'description': '',
381 'uploader': '8KVIDEO',
382 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 383 },
4bc3a23e
PH
384 'params': {
385 'youtube_include_dash_manifest': True,
386 'format': '141',
4919603f 387 },
dd27fd17 388 },
3489b7d2
JMF
389 # DASH manifest with encrypted signature
390 {
78caa52a
PH
391 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
392 'info_dict': {
393 'id': 'IB3lcPjvWLA',
394 'ext': 'm4a',
395 'title': 'Afrojack - The Spark ft. Spree Wilson',
396 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
397 'uploader': 'AfrojackVEVO',
398 'uploader_id': 'AfrojackVEVO',
399 'upload_date': '20131011',
3489b7d2 400 },
4bc3a23e 401 'params': {
78caa52a
PH
402 'youtube_include_dash_manifest': True,
403 'format': '141',
3489b7d2
JMF
404 },
405 },
2eb88d95
PH
406 ]
407
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty in-memory player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keyed by
    # (player_url, signature cache id), values are deciphering callables.
    self._player_cache = dict()
e0df6211 411
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is about to be downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 415
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that video information is about to be extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
419
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
423
def report_rtmp_download(self):
    """Announce that the download was detected to use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 427
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Describe a signature by the lengths of its dot-separated parts.

    The lengths are joined with '.', so only the shape of the signature
    (not its content) ends up in the identifier.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
431
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are parsed from player_url.
    If the downloader's 'youtube-sigfuncs' cache already holds an index
    list for this player and signature shape, a function that reorders
    characters by that list is returned without downloading anything.
    Otherwise the player is downloaded and parsed, the resulting function
    is probed once to derive the index list, the list is stored in the
    cache, and the parsed function is returned.

    Raises ExtractorError if the player URL cannot be identified, the
    cache id is not a plain file name, or the player type is unknown.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id is used as a cache entry name, so it must not contain any
    # directory component. Checked with a real raise because an assert
    # would disappear under `python -O`.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError(
            'Invalid signature function cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Explicit raise: with an assert, -O would fall through to an
        # unbound `res` below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed the function a string whose i-th character has code point i;
    # the code points of the output are then the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
473
def _print_sig_code(self, func, example_sig):
    """Print Python code that reproduces what func does to a signature.

    func is probed with a string whose i-th character has code point i, so
    the code points of its output are the source indices. Runs of indices
    that step by +1 or -1 are rendered as slices of `s`, everything else as
    single subscripts, and the pieces are joined with ' + '. The result is
    written via to_screen, guarded by a check on the signature's part
    lengths.
    """
    def _slice_expr(first, last, stride):
        lower = str(first) if first != 0 else ''
        bound = last + stride
        upper = (':%d' % bound) if bound >= 0 else ':'
        suffix = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, suffix)

    def _expressions(indices):
        stride = None
        # Placeholder only, to keep linters quiet: run_start is always
        # assigned together with stride before it is read.
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                # Inside a run: either it continues, or it ends at `before`.
                if delta != stride:
                    yield _slice_expr(run_start, before, stride)
                    stride = None
                continue
            if delta in [-1, 1]:
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        # Emit whatever is still pending for the final index.
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(compat_chr(n) for n in range(len(example_sig)))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_expressions(index_spec))
    part_lengths = ', '.join(
        compat_str(len(piece)) for piece in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 512
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from JavaScript player code.

    The function name is taken from the first `signature=<name>` occurrence
    in jscode and extracted with JSInterpreter. The returned callable takes
    the signature string and passes it on as a one-element argument list.
    """
    name_pattern = r'signature=([$a-zA-Z]+)'
    funcname = self._search_regex(
        name_pattern, jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(funcname)

    def _apply(s):
        return decipher([s])
    return _apply
521
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from SWF player contents.

    Extracts the 'decipher' function of the 'SignatureDecipher' class using
    SWFInterpreter. The returned callable takes the signature string and
    passes it on as a one-element argument list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        return decipher([s])
    return _apply
528
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The deciphering function is looked up in (or added to) the in-memory
    player cache under (player_url, signature cache id). When the
    'youtube_print_sig_code' downloader param is set, the equivalent code
    is printed as well. age_gate is accepted but not used here.

    Raises ExtractorError if player_url is None, or wrapping any exception
    raised while obtaining or applying the function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URL: complete it with https.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        # Include the full traceback in the message for diagnosis.
        tb = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + tb, cause=e)
e0df6211 552
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from the timedtext listing endpoint; only the
    first track seen for each language code is kept. webpage is not used.
    Returns an empty dict (after a warning) when the list cannot be
    downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    urls_by_lang = {}
    for track_name, lang in tracks:
        if lang in urls_by_lang:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
580
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic caption URL.

    The caption base URL and timestamp are read from the ytplayer.config
    JSON embedded in webpage, which is passed in so it does not have to be
    downloaded again. Returns an empty dict (after a warning) when the
    config is missing, lacks the needed keys, the track list cannot be
    downloaded, or the first track is not of kind 'asr'.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']

        # Ask for the list of available tracks and translation targets
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(
            caption_url + '&' + list_query, video_id)
        first_track = caption_list.find('track')
        has_asr = (
            first_track is not None and
            first_track.attrib.get('kind') == 'asr')
        if not has_asr:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = first_track.attrib['lang_code']

        captions_by_lang = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            captions_by_lang[target_lang] = caption_url + '&' + query
        return captions_by_lang
    # The download step can raise an ExtractorError when the video has
    # regular subtitles but no automatic captions
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
627
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError if url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
635
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to URL.

    Every non-empty line that does not start with '#' is treated as a
    format URL; its itag is taken from the 'itag/<digits>/' path segment.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
649
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for video_id."""
    url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 653
c5e8d7af 654 def _real_extract(self, url):
7e8c0af0 655 proto = (
78caa52a
PH
656 'http' if self._downloader.params.get('prefer_insecure', False)
657 else 'https')
7e8c0af0 658
c5e8d7af
PH
659 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
660 mobj = re.search(self._NEXT_URL_RE, url)
661 if mobj:
7e8c0af0 662 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 663 video_id = self.extract_id(url)
c5e8d7af
PH
664
665 # Get video webpage
7e8c0af0 666 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
a1f934b1
PH
667 pref_cookies = [
668 c for c in self._downloader.cookiejar
669 if c.domain == '.youtube.com' and c.name == 'PREF']
670 for pc in pref_cookies:
671 if 'hl=' in pc.value:
672 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
673 else:
674 if pc.value:
675 pc.value += '&'
676 pc.value += 'hl=en'
677 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
678
679 # Attempt to extract SWF player URL
e0df6211 680 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
681 if mobj is not None:
682 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
683 else:
684 player_url = None
685
686 # Get video info
687 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
688 if re.search(r'player-age-gate-content">', video_webpage) is not None:
689 self.report_age_confirmation()
690 age_gate = True
691 # We simulate the access to the video from www.youtube.com/v/{video_id}
692 # this can be viewed without login into Youtube
2c57c7fa
JMF
693 data = compat_urllib_parse.urlencode({
694 'video_id': video_id,
695 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
696 'sts': self._search_regex(
697 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 698 })
7e8c0af0 699 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
700 video_info_webpage = self._download_webpage(video_info_url, video_id,
701 note=False,
702 errnote='unable to download video info webpage')
703 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
704 else:
705 age_gate = False
706 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 707 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
708 % (video_id, el_type))
709 video_info_webpage = self._download_webpage(video_info_url, video_id,
710 note=False,
711 errnote='unable to download video info webpage')
712 video_info = compat_parse_qs(video_info_webpage)
713 if 'token' in video_info:
714 break
c5e8d7af
PH
715 if 'token' not in video_info:
716 if 'reason' in video_info:
d11271dd 717 raise ExtractorError(
78caa52a 718 'YouTube said: %s' % video_info['reason'][0],
d11271dd 719 expected=True, video_id=video_id)
c5e8d7af 720 else:
d11271dd 721 raise ExtractorError(
78caa52a 722 '"token" parameter not in video info for unknown reason',
d11271dd 723 video_id=video_id)
c5e8d7af 724
1d699755
PH
725 if 'view_count' in video_info:
726 view_count = int(video_info['view_count'][0])
727 else:
728 view_count = None
729
c5e8d7af
PH
730 # Check for "rental" videos
731 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 732 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
733
734 # Start extracting information
735 self.report_information_extraction(video_id)
736
737 # uploader
738 if 'author' not in video_info:
69ea8ca4 739 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
740 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
741
742 # uploader_id
743 video_uploader_id = None
744 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
745 if mobj is not None:
746 video_uploader_id = mobj.group(1)
747 else:
69ea8ca4 748 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
749
750 # title
a8c6b241 751 if 'title' in video_info:
aa92f063 752 video_title = video_info['title'][0]
a8c6b241 753 else:
69ea8ca4 754 self._downloader.report_warning('Unable to extract video title')
78caa52a 755 video_title = '_'
c5e8d7af
PH
756
757 # thumbnail image
7763b04e
JMF
758 # We try first to get a high quality image:
759 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
760 video_webpage, re.DOTALL)
761 if m_thumb is not None:
762 video_thumbnail = m_thumb.group(1)
763 elif 'thumbnail_url' not in video_info:
69ea8ca4 764 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 765 video_thumbnail = None
c5e8d7af
PH
766 else: # don't panic if we can't find it
767 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
768
769 # upload date
770 upload_date = None
ad3bc6ac 771 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
772 if mobj is None:
773 mobj = re.search(
263bd4ec 774 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 775 video_webpage)
c5e8d7af
PH
776 if mobj is not None:
777 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
778 upload_date = unified_strdate(upload_date)
779
55f7bd2d
PH
780 m_cat_container = self._search_regex(
781 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
782 video_webpage, 'categories', fatal=False)
ec8deefc 783 if m_cat_container:
ad3bc6ac 784 category = self._html_search_regex(
01ed5c9b 785 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
786 default=None)
787 video_categories = None if category is None else [category]
788 else:
789 video_categories = None
ec8deefc 790
c5e8d7af
PH
791 # description
792 video_description = get_element_by_id("eow-description", video_webpage)
793 if video_description:
27dcce19
PH
794 video_description = re.sub(r'''(?x)
795 <a\s+
796 (?:[a-zA-Z-]+="[^"]+"\s+)*?
797 title="([^"]+)"\s+
798 (?:[a-zA-Z-]+="[^"]+"\s+)*?
799 class="yt-uix-redirect-link"\s*>
800 [^<]+
801 </a>
802 ''', r'\1', video_description)
c5e8d7af
PH
803 video_description = clean_html(video_description)
804 else:
805 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
806 if fd_mobj:
807 video_description = unescapeHTML(fd_mobj.group(1))
808 else:
78caa52a 809 video_description = ''
c5e8d7af 810
f30a38be 811 def _extract_count(count_name):
46374a56 812 count = self._search_regex(
f30a38be
JMF
813 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
814 video_webpage, count_name, default=None)
336c3a69
JMF
815 if count is not None:
816 return int(count.replace(',', ''))
817 return None
69ea8ca4
PH
818 like_count = _extract_count('like')
819 dislike_count = _extract_count('dislike')
336c3a69 820
c5e8d7af 821 # subtitles
d82134c3 822 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 823
c5e8d7af 824 if self._downloader.params.get('listsubtitles', False):
d665f8d3 825 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
826 return
827
828 if 'length_seconds' not in video_info:
69ea8ca4 829 self._downloader.report_warning('unable to extract video duration')
b466b702 830 video_duration = None
c5e8d7af 831 else:
b466b702 832 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 833
1fb07d10
JG
834 # annotations
835 video_annotations = None
836 if self._downloader.params.get('writeannotations', False):
837 video_annotations = self._extract_annotations(video_id)
838
c5e8d7af 839 # Decide which formats to download
c5e8d7af 840 try:
ae7ed920 841 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
842 if not mobj:
843 raise ValueError('Could not find vevo ID')
ae7ed920
PH
844 json_code = uppercase_escape(mobj.group(1))
845 ytplayer_config = json.loads(json_code)
3489b7d2 846 args = ytplayer_config['args']
7ce7e394
JMF
847 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
848 # this signatures are encrypted
44d46655 849 if 'url_encoded_fmt_stream_map' not in args:
69ea8ca4 850 raise ValueError('No stream_map present') # caught below
00fe14fc
JMF
851 re_signature = re.compile(r'[&,]s=')
852 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394 853 if m_s is not None:
69ea8ca4 854 self.to_screen('%s: Encrypted signatures detected.' % video_id)
c5e8d7af 855 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
78caa52a 856 m_s = re_signature.search(args.get('adaptive_fmts', ''))
b7a68384 857 if m_s is not None:
00fe14fc
JMF
858 if 'adaptive_fmts' in video_info:
859 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 860 else:
00fe14fc 861 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
862 except ValueError:
863 pass
864
dd27fd17
PH
865 def _map_to_format_list(urlmap):
866 formats = []
867 for itag, video_real_url in urlmap.items():
868 dct = {
869 'format_id': itag,
870 'url': video_real_url,
871 'player_url': player_url,
872 }
0b65e5d4
PH
873 if itag in self._formats:
874 dct.update(self._formats[itag])
dd27fd17
PH
875 formats.append(dct)
876 return formats
877
c5e8d7af
PH
878 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
879 self.report_rtmp_download()
dd27fd17
PH
880 formats = [{
881 'format_id': '_rtmp',
882 'protocol': 'rtmp',
883 'url': video_info['conn'][0],
884 'player_url': player_url,
885 }]
00fe14fc
JMF
886 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
887 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
888 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 889 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 890 url_map = {}
00fe14fc 891 for url_data_str in encoded_url_map.split(','):
c5e8d7af 892 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
893 if 'itag' not in url_data or 'url' not in url_data:
894 continue
895 format_id = url_data['itag'][0]
896 url = url_data['url'][0]
897
898 if 'sig' in url_data:
899 url += '&signature=' + url_data['sig'][0]
900 elif 's' in url_data:
901 encrypted_sig = url_data['s'][0]
902
903 if not age_gate:
904 jsplayer_url_json = self._search_regex(
905 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 906 video_webpage, 'JS player URL')
201e9eaa
PH
907 player_url = json.loads(jsplayer_url_json)
908 if player_url is None:
909 player_url_json = self._search_regex(
910 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 911 video_webpage, 'age gate player URL')
201e9eaa
PH
912 player_url = json.loads(player_url_json)
913
914 if self._downloader.params.get('verbose'):
cf010131 915 if player_url is None:
201e9eaa
PH
916 player_version = 'unknown'
917 player_desc = 'unknown'
918 else:
919 if player_url.endswith('swf'):
920 player_version = self._search_regex(
921 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 922 'flash player', fatal=False)
201e9eaa 923 player_desc = 'flash player %s' % player_version
cf010131 924 else:
201e9eaa
PH
925 player_version = self._search_regex(
926 r'html5player-([^/]+?)(?:/html5player)?\.js',
927 player_url,
928 'html5 player', fatal=False)
78caa52a 929 player_desc = 'html5 player %s' % player_version
201e9eaa 930
60064c53 931 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 932 self.to_screen('{%s} signature length %s, %s' %
98eb1c3f 933 (format_id, parts_sizes, player_desc))
201e9eaa
PH
934
935 signature = self._decrypt_signature(
936 encrypted_sig, video_id, player_url, age_gate)
937 url += '&signature=' + signature
938 if 'ratebypass' not in url:
939 url += '&ratebypass=yes'
940 url_map[format_id] = url
dd27fd17 941 formats = _map_to_format_list(url_map)
1d043b93
JMF
942 elif video_info.get('hlsvp'):
943 manifest_url = video_info['hlsvp'][0]
944 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 945 formats = _map_to_format_list(url_map)
c5e8d7af 946 else:
69ea8ca4 947 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 948
dd27fd17 949 # Look for the DASH manifest
203fb43f 950 if self._downloader.params.get('youtube_include_dash_manifest', True):
dd27fd17 951 try:
d68f0cdb 952 # The DASH manifest used needs to be the one from the original video_webpage.
953 # The one found in get_video_info seems to be using different signatures.
954 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
955 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
956 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
957 if age_gate:
3489b7d2 958 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 959 else:
3489b7d2 960 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 961 def decrypt_sig(mobj):
962 s = mobj.group(1)
963 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
964 return '/signature/%s' % dec_s
965 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 966 dash_doc = self._download_xml(
d68f0cdb 967 dash_manifest_url, video_id,
69ea8ca4
PH
968 note='Downloading DASH manifest',
969 errnote='Could not download DASH manifest')
970 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
dd27fd17
PH
971 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
972 if url_el is None:
973 continue
974 format_id = r.attrib['id']
975 video_url = url_el.text
976 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
977 f = {
978 'format_id': format_id,
979 'url': video_url,
980 'width': int_or_none(r.attrib.get('width')),
981 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
982 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
983 'filesize': filesize,
984 }
985 try:
986 existing_format = next(
987 fo for fo in formats
988 if fo['format_id'] == format_id)
989 except StopIteration:
990 f.update(self._formats.get(format_id, {}))
991 formats.append(f)
992 else:
993 existing_format.update(f)
994
995 except (ExtractorError, KeyError) as e:
69ea8ca4 996 self.report_warning('Skipping DASH manifest: %s' % e, video_id)
d80044c2 997
4bcc7bd1 998 self._sort_formats(formats)
4ea3be0a 999
1000 return {
1001 'id': video_id,
1002 'uploader': video_uploader,
1003 'uploader_id': video_uploader_id,
1004 'upload_date': upload_date,
1005 'title': video_title,
1006 'thumbnail': video_thumbnail,
1007 'description': video_description,
ec8deefc 1008 'categories': video_categories,
4ea3be0a 1009 'subtitles': video_subtitles,
1010 'duration': video_duration,
1011 'age_limit': 18 if age_gate else 0,
1012 'annotations': video_annotations,
7e8c0af0 1013 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1014 'view_count': view_count,
1015 'like_count': like_count,
1016 'dislike_count': dislike_count,
1017 'formats': formats,
1018 }
c5e8d7af 1019
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a YouTube playlist (or of a generated mix)."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result for a mix ('RD...' playlist id)."""
        # A mix is generated from a single video; the last 11 characters of
        # the playlist id are used as the id of that video.
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Try the known title containers in order, stopping at the first
        # one that yields something.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        id_matches = re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage)
        url_results = self._ids_to_results(orderedSet(id_matches))

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for url."""
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        # A watch URL can name a video as well as the list it belongs to
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Collect the video ids page by page. The first "page" is the
        # playlist page itself; further ones come from the load-more link.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # Drop duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                entry.group('id')
                for entry in re.finditer(self._VIDEO_RE, content_html)
                if entry.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1189
1190
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract a top list addressed as "yttoplist:{channel}:{list title}"."""

    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on downloads of the list page; it sometimes comes back
    # without any videos and has to be fetched again.
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Locate the list on the channel page and return its videos.

        Raises ExtractorError if the list link cannot be found on the
        channel page, or if the list page still contains no videos after
        _MAX_DOWNLOAD_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The link is searched for by its URL-encoded "title=..." form
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos: retry, but only
        # a bounded number of times so a page that never yields videos
        # cannot keep the extractor looping forever.
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos in top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1232
1233
class YoutubeChannelIE(InfoExtractor):
    """Extract all the videos listed on a YouTube channel."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, first occurrence order, no duplicates."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        # Extract channel id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = url_match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the load-more
            # widget no longer advertises a further page
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1293
1294
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user through the GData feed."""

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept url only if no other extractor of this module accepts it."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError on an invalid URL or when an API response is
        not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video of the requested feed page."""
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # A page holds _GDATA_PAGE_SIZE entries beginning at start_index,
            # so the last one is at start_index + size - 1 (not + size).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries means we are past the last page
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1366
b05654f0
PH
1367
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData API ("ytsearch" keyword)."""

    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of PAGE_SIZE results are requested until n ids were collected
        or the number of items reported by the API is exhausted.
        Raises ExtractorError (expected) when the search has no results.
        """

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                if not video_ids:
                    raise ExtractorError(
                        '[youtube] No video results', expected=True)
                # A later page came back empty: keep what was already
                # collected instead of reporting that nothing was found.
                break

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past the requested amount
        video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
75dff0ee 1409
c9ae7b95 1410
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""

    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1416
c9ae7b95
PH
1417
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a YouTube search results page URL into a playlist of its hits."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result title block."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        title_blocks = re.finditer(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        for block in title_blocks:
            part_code = block.group(1)
            # The title is optional, the link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1459
1460
class YoutubeShowIE(InfoExtractor):
    """Extract a show as a playlist of its season playlists."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional (fatal=False)
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1495
1496
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    https://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # The doubled %% survives the formatting below as a single %s
        # placeholder, which _real_extract fills with the paging value.
        feed_action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (feed_action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        # Walk the feed page by page, starting at paging value 0, until a
        # page no longer advertises a "load more" link.
        collected = []
        next_paging = 0
        for page_num in itertools.count(1):
            page_data = self._download_json(
                self._FEED_TEMPLATE % next_paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = page_data.get('feed_html') or page_data.get('content_html')
            # Fall back to the feed markup itself when no separate widget
            # markup is supplied
            widget_html = page_data.get('load_more_widget_html') or feed_html

            # De-duplicate the ids of this page while keeping their order
            page_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            more_match = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                widget_html)
            if more_match is None:
                break
            next_paging = more_match.group('paging')
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1542
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "recommended" feed. _PERSONAL_FEED is not
    # overridden here, so the base class default applies.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1548
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "watch_later" feed.
    _FEED_NAME = 'watch_later'
    # Makes the base class request action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1555
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "history" feed.
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: in a plain literal "\." is an
    # invalid escape sequence that only works because Python passes unknown
    # escapes through (and warns about it on newer versions). The pattern
    # value itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Makes the base class request action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1562
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Resolves the my_favorites page to the playlist it links to and hands
    # that playlist id over to the YoutubePlaylist extractor.
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The incoming url is not used: the canonical favourites page is
        # always the one fetched.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # First "list=" parameter found in the page markup
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1573
1574
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    # Collects the video ids shown on the subscriptions feed, following the
    # "load more" links, and returns them as one playlist.
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # NOTE(review): presumably blanks out test cases inherited from
    # YoutubePlaylistIE - confirm against the parent class
    _TESTS = []

    def _real_extract(self, url):
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Extraction works like it does for playlists, except that the
        # video id regex carries no index
        video_ids = []
        # For the initial page the listing and the "load more" widget are
        # both searched in the same HTML document
        listing_html = first_page
        widget_html = first_page

        for page_num in itertools.count(1):
            # Ids are de-duplicated per page, keeping first-seen order
            video_ids += orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', listing_html))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if more_match is None:
                # No further page advertised
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            listing_html = more['content_html']
            widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1611
1612
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Matches watch URLs whose query string is empty or holds only a
    # feature= / annotation_id= parameter, and attribution_link URLs that
    # hold only the a= parameter. Extraction always fails with a hint about
    # quoting the URL.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted from such a URL; raise an expected error
        # that explains the likely cause.
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(quoting_hint, expected=True)