]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add webm audio formats (Fixes #4229)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
9c44d242 29 OnDemandPagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL (non-fatally) and return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note='Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises
            # (the bare ``return`` used before yielded None instead).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail
                # out instead of crashing on ``None.group``.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Still seeing the login form after submitting credentials: rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; failures are non-fatal."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note='Confirming age', errnote='Unable to confirm age',
            fatal=False)

    def _real_initialize(self):
        """Set the language (when credentials exist), log in, then confirm age.

        Does nothing without a downloader and stops at the first step that
        reports failure.
        """
        if self._downloader is None:
            return
        if self._get_login_info()[0] is not None:
            if not self._set_language():
                return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 203 IE_DESC = 'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
edb53e2d 206 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
ac7553d0 215 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 224 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 228 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
229 (?(1).+)? # if we found the ID, everything can follow
230 $"""
c5e8d7af 231 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
232 _formats = {
233 '5': {'ext': 'flv', 'width': 400, 'height': 240},
234 '6': {'ext': 'flv', 'width': 450, 'height': 270},
235 '13': {'ext': '3gp'},
236 '17': {'ext': '3gp', 'width': 176, 'height': 144},
237 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
238 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
239 '34': {'ext': 'flv', 'width': 640, 'height': 360},
240 '35': {'ext': 'flv', 'width': 854, 'height': 480},
241 '36': {'ext': '3gp', 'width': 320, 'height': 240},
242 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
243 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
244 '43': {'ext': 'webm', 'width': 640, 'height': 360},
245 '44': {'ext': 'webm', 'width': 854, 'height': 480},
246 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
247 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
248
1d043b93 249
86fe61c8 250 # 3d videos
43b81eb9
PH
251 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
252 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
253 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
254 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
255 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
256 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
257 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 258
96fb5605 259 # Apple HTTP Live Streaming
43b81eb9
PH
260 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
262 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
263 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
264 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
265 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
266 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
267
268 # DASH mp4 video
43b81eb9
PH
269 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
277 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
279 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 280
f6f1fc92 281 # Dash mp4 audio
2c62dc26
PH
282 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
283 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
284 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
285
286 # Dash webm
e75cafe9
A
287 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
290 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
291 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
292 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 293 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
294 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 301 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 302 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
303 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
304 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
305
306 # Dash webm audio
55db73ef 307 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 308 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 309
0857baad
PH
310 # Dash webm audio with opus inside
311 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
312 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
313 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
314
ce6b9a2d
PH
315 # RTMP (unnamed)
316 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 317 }
836a086c 318
78caa52a 319 IE_NAME = 'youtube'
2eb88d95
PH
320 _TESTS = [
321 {
4bc3a23e
PH
322 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
323 'info_dict': {
324 'id': 'BaW_jenozKc',
325 'ext': 'mp4',
326 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
327 'uploader': 'Philipp Hagemeister',
328 'uploader_id': 'phihag',
329 'upload_date': '20121002',
330 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
331 'categories': ['Science & Technology'],
3e7c1224
PH
332 'like_count': int,
333 'dislike_count': int,
2eb88d95 334 }
0e853ca4 335 },
0e853ca4 336 {
4bc3a23e
PH
337 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
338 'note': 'Test generic use_cipher_signature video (#897)',
339 'info_dict': {
340 'id': 'UxxajLWwzqY',
341 'ext': 'mp4',
342 'upload_date': '20120506',
343 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
344 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
345 'uploader': 'Icona Pop',
346 'uploader_id': 'IconaPop',
2eb88d95 347 }
c108eb73
JMF
348 },
349 {
4bc3a23e
PH
350 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
351 'note': 'Test VEVO video with age protection (#956)',
352 'info_dict': {
353 'id': '07FYdnEawAQ',
354 'ext': 'mp4',
355 'upload_date': '20130703',
356 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
357 'description': 'md5:64249768eec3bc4276236606ea996373',
358 'uploader': 'justintimberlakeVEVO',
359 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
360 }
361 },
fccd3771 362 {
4bc3a23e
PH
363 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
364 'note': 'Embed-only video (#1746)',
365 'info_dict': {
366 'id': 'yZIXLfi8CZQ',
367 'ext': 'mp4',
368 'upload_date': '20120608',
369 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
370 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
371 'uploader': 'SET India',
372 'uploader_id': 'setindia'
fccd3771
PH
373 }
374 },
dd27fd17 375 {
4bc3a23e
PH
376 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
377 'note': '256k DASH audio (format 141) via DASH manifest',
378 'info_dict': {
379 'id': 'a9LDPn-MO4I',
380 'ext': 'm4a',
381 'upload_date': '20121002',
382 'uploader_id': '8KVIDEO',
383 'description': '',
384 'uploader': '8KVIDEO',
385 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 386 },
4bc3a23e
PH
387 'params': {
388 'youtube_include_dash_manifest': True,
389 'format': '141',
4919603f 390 },
dd27fd17 391 },
3489b7d2
JMF
392 # DASH manifest with encrypted signature
393 {
78caa52a
PH
394 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
395 'info_dict': {
396 'id': 'IB3lcPjvWLA',
397 'ext': 'm4a',
398 'title': 'Afrojack - The Spark ft. Spree Wilson',
399 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
400 'uploader': 'AfrojackVEVO',
401 'uploader_id': 'AfrojackVEVO',
402 'upload_date': '20131011',
3489b7d2 403 },
4bc3a23e 404 'params': {
78caa52a
PH
405 'youtube_include_dash_manifest': True,
406 'format': '141',
3489b7d2
JMF
407 },
408 },
2eb88d95
PH
409 ]
410
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and set up the player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: maps (player_url, signature layout)
    # keys to the extracted signature functions.
    self._player_cache = dict()
e0df6211 414
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 418
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
422
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
426
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 430
60064c53
PH
431 def _signature_cache_id(self, example_sig):
432 """ Return a string representation of a signature """
78caa52a 433 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
434
435 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 436 id_m = re.match(
c081b35c 437 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 438 player_url)
c081b35c
PH
439 if not id_m:
440 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
441 player_type = id_m.group('ext')
442 player_id = id_m.group('id')
443
c4417ddb 444 # Read from filesystem cache
60064c53
PH
445 func_id = '%s_%s_%s' % (
446 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 447 assert os.path.basename(func_id) == func_id
a0e07d31 448
69ea8ca4 449 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 450 if cache_spec is not None:
78caa52a 451 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 452
e0df6211
PH
453 if player_type == 'js':
454 code = self._download_webpage(
455 player_url, video_id,
69ea8ca4
PH
456 note='Downloading %s player %s' % (player_type, player_id),
457 errnote='Download of %s failed' % player_url)
83799698 458 res = self._parse_sig_js(code)
c4417ddb 459 elif player_type == 'swf':
e0df6211
PH
460 urlh = self._request_webpage(
461 player_url, video_id,
69ea8ca4
PH
462 note='Downloading %s player %s' % (player_type, player_id),
463 errnote='Download of %s failed' % player_url)
e0df6211 464 code = urlh.read()
83799698 465 res = self._parse_sig_swf(code)
e0df6211
PH
466 else:
467 assert False, 'Invalid player type %r' % player_type
468
a0e07d31 469 if cache_spec is None:
78caa52a 470 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
471 cache_res = res(test_string)
472 cache_spec = [ord(c) for c in cache_res]
83799698 473
69ea8ca4 474 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
475 return res
476
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is applied to a probe string whose characters encode their own
    positions; the resulting index list is rendered as a sum of ``s[...]``
    index and slice expressions, guarded by a check on the part lengths of
    *example_sig*.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) as a Python slice of ``s``.
        lower = '' if first == 0 else str(first)
        upper = (':%d' % (last + stride)) if last + stride >= 0 else ':'
        suffix = '' if stride == 1 else (':%d' % stride)
        return 's[%s%s%s]' % (lower, upper, suffix)

    def _sig_expressions(indices):
        stride = None
        # Squelch pyflakes warnings - run_start is assigned as soon as
        # stride is set
        run_start = '(Never used)'
        for cur, prev in zip(indices[1:], indices[:-1]):
            delta = cur - prev
            if stride is not None:
                if delta == stride:
                    # Still inside the current run.
                    continue
                yield _slice_expr(run_start, prev, stride)
                stride = None
            elif delta in [-1, 1]:
                # A new ascending or descending run starts at prev.
                stride = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(_sig_expressions(index_spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 515
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the initial signature function is located with a regex and
    extracted through JSInterpreter; the returned callable takes the
    signature string and passes it on as a one-element argument list.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def _decipher(s):
        return js_function([s])

    return _decipher
524
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the SWF player contents.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    through SWFInterpreter; the returned callable takes the signature
    string and passes it on as a one-element argument list.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def _decipher(s):
        return swf_function([s])

    return _decipher
531
83799698 532 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 533 """Turn the encrypted s field into a working signature"""
6b37f0be 534
c8bf86d5 535 if player_url is None:
69ea8ca4 536 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 537
69ea8ca4 538 if player_url.startswith('//'):
78caa52a 539 player_url = 'https:' + player_url
c8bf86d5 540 try:
62af3a0e 541 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
542 if player_id not in self._player_cache:
543 func = self._extract_signature_function(
60064c53 544 video_id, player_url, s
c8bf86d5
PH
545 )
546 self._player_cache[player_id] = func
547 func = self._player_cache[player_id]
548 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 549 self._print_sig_code(func, s)
c8bf86d5
PH
550 return func(s)
551 except Exception as e:
552 tb = traceback.format_exc()
553 raise ExtractorError(
78caa52a 554 'Signature extraction failed: ' + tb, cause=e)
e0df6211 555
1f343eaa 556 def _get_available_subtitles(self, video_id, webpage):
de7f3446 557 try:
7fad1c63 558 sub_list = self._download_webpage(
38c2e5b8 559 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
560 video_id, note=False)
561 except ExtractorError as err:
69ea8ca4 562 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446
JMF
563 return {}
564 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
565
566 sub_lang_list = {}
567 for l in lang_list:
568 lang = l[1]
7e660ac1
LD
569 if lang in sub_lang_list:
570 continue
de7f3446
JMF
571 params = compat_urllib_parse.urlencode({
572 'lang': lang,
573 'v': video_id,
ca715127 574 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 575 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 576 })
78caa52a 577 url = 'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
578 sub_lang_list[lang] = url
579 if not sub_lang_list:
69ea8ca4 580 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
581 return {}
582 return sub_lang_list
583
055e6f36 584 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
585 """We need the webpage for getting the captions url, pass it as an
586 argument to speed up the process."""
ca715127 587 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 588 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 589 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 590 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
591 if mobj is None:
592 self._downloader.report_warning(err_msg)
593 return {}
594 player_config = json.loads(mobj.group(1))
595 try:
596 args = player_config[u'args']
597 caption_url = args[u'ttsurl']
598 timestamp = args[u'timestamp']
055e6f36
JMF
599 # We get the available subtitles
600 list_params = compat_urllib_parse.urlencode({
601 'type': 'list',
602 'tlangs': 1,
603 'asrs': 1,
de7f3446 604 })
055e6f36 605 list_url = caption_url + '&' + list_params
e26f8712 606 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 607 original_lang_node = caption_list.find('track')
f6a54188 608 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
69ea8ca4 609 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
610 return {}
611 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
612
613 sub_lang_list = {}
614 for lang_node in caption_list.findall('target'):
615 sub_lang = lang_node.attrib['lang_code']
616 params = compat_urllib_parse.urlencode({
617 'lang': original_lang,
618 'tlang': sub_lang,
619 'fmt': sub_format,
620 'ts': timestamp,
621 'kind': 'asr',
622 })
623 sub_lang_list[sub_lang] = caption_url + '&' + params
624 return sub_lang_list
de7f3446
JMF
625 # An extractor error can be raise by the download process if there are
626 # no automatic captions but there are subtitles
627 except (KeyError, ExtractorError):
628 self._downloader.report_warning(err_msg)
629 return {}
630
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in *url*.

    Raises ExtractorError when *url* does not match _VALID_URL.
    """
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is not None:
        return mobj.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
638
1d043b93
JMF
639 def _extract_from_m3u8(self, manifest_url, video_id):
640 url_map = {}
641 def _get_urls(_manifest):
642 lines = _manifest.split('\n')
643 urls = filter(lambda l: l and not l.startswith('#'),
644 lines)
645 return urls
78caa52a 646 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
647 formats_urls = _get_urls(manifest)
648 for format_url in formats_urls:
890f62e8 649 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
650 url_map[itag] = format_url
651 return url_map
652
1fb07d10
JG
653 def _extract_annotations(self, video_id):
654 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 655 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 656
def _real_extract(self, url):
    """Extract the metadata and the downloadable formats of a single video.

    Downloads the watch page and the get_video_info data, collects
    uploader, title, thumbnail, upload date, category, description and
    like/dislike counters, builds the format list (RTMP,
    url_encoded_fmt_stream_map/adaptive_fmts or HLS) and, unless disabled
    through the 'youtube_include_dash_manifest' parameter, merges in the
    formats listed in the DASH manifest.

    Returns the info dictionary of the video, or None when the
    'listsubtitles' parameter is set (the subtitles are only listed).
    Raises ExtractorError when the video info carries no token, for rental
    videos, when no uploader is present, for rtmpe streams and when no
    known stream source is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    # Force hl=en in every PREF cookie of .youtube.com before downloading
    pref_cookies = [
        c for c in self._downloader.cookiejar
        if c.domain == '.youtube.com' and c.name == 'PREF']
    for pc in pref_cookies:
        if 'hl=' in pc.value:
            pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
        else:
            if pc.value:
                pc.value += '&'
            pc.value += 'hl=en'
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the 'el' variants in turn until one answer contains a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (
                proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    # The link may be served with either scheme, so accept http and https
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="https?://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories (only the first link of the "Category" list is used)
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', fatal=False)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read the integer shown in the "watch-<count_name>" element, if any
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Stays None when the player configuration cannot be found or parsed;
    # the DASH code below must cope with that.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError('No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen('%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', ''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} mapping into a list of format dictionaries,
        # enriched with the static data of self._formats when the itag is known
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                dash_manifest_url = video_info.get('dashmpd', [None])[0]
            else:
                # ytplayer_config is None when the player config was not found
                dash_manifest_url = (ytplayer_config or {}).get('args', {}).get('dashmpd')
            if not dash_manifest_url:
                # Handled below like any other missing key: warn and go on
                # with the formats found so far instead of crashing
                raise KeyError('dashmpd')

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note='Downloading DASH manifest',
                errnote='Could not download DASH manifest')
            for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, or add a new one
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning('Skipping DASH manifest: %r' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1022
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, given as URL or as bare playlist id."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Build the playlist result of a mix ('RD' playlist id)."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the last 11 characters are the video id.
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # The first class name giving a non-empty element wins; when none
        # does, the result of the last lookup is kept.
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        id_pattern = (
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s'''
            % re.escape(playlist_id))
        ids = orderedSet(re.findall(id_pattern, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for url (or a single video result
        when the URL names a video and 'noplaylist' is set)."""
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        page = self._download_webpage(
            self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page is
        # used both as video list and as "load more" widget
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1193
1194
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""

    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on the number of downloads of the list page; without it a
    # page that never contains videos would be requested forever.
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in url.

        The channel page is searched for the link of the list with the
        requested title; the linked page is then downloaded until it
        contains video ids.
        Raises ExtractorError when no video id shows up after
        _MAX_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after _MAX_ATTEMPTS downloads
        for attempt in range(self._MAX_ATTEMPTS):
            msg = 'Downloading top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any video in the top list after %d attempts'
                % self._MAX_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1236
1237
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel page."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates and
        in order of first appearance."""
        seen = set()
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            found_id = link.group(1)
            if found_id in seen:
                continue
            seen.add(found_id)
            unique_ids.append(found_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        # Extract channel id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 0
            while True:
                pagenum += 1
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers to load more pages
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1297
1298
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user (URL or "ytuser:" keyword)."""

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether url should be handled by this extractor."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match as well.
        other_ies = iter(
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result paging lazily through the user's uploads."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding one url entry per video of page `pagenum`
            # (0-based); the API start index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # A page holds _GDATA_PAGE_SIZE entries, so the last index
            # requested is start_index + _GDATA_PAGE_SIZE - 1
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1370
b05654f0
PH
1371
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the gdata API."""

    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        # Lowered to the total number of items reported by the API as soon
        # as the first answer is in
        limit = n

        for pagenum in itertools.count():
            first_index = PAGE_SIZE * pagenum
            if first_index >= limit:
                break

            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                first_index + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend([video['id'] for video in api_response['items']])
            limit = min(n, api_response['totalItems'])

        # The last page may have delivered more ids than requested
        video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1413
c9ae7b95 1414
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1420
c9ae7b95
PH
1421
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for the URLs of YouTube search result pages."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result of the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Every result is introduced by a "yt-lockup-title" heading
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional, the URL is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1463
1464
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season of a show is a playlist."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        show = {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
        return show
04cc9617
JMF
1499
1500
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; the remaining ``%s`` is the paging value."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return a playlist of its videos.

        Each page is a JSON document; video ids are scraped from its
        ``feed_html`` (or ``content_html``) member and the next paging value
        from the ``data-uix-load-more-href`` link. Pagination stops when
        that link is absent or when a page contributes no video that has
        not been collected already.
        """
        feed_entries = []
        seen_ids = set()
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i)
            # Fall back to an empty page when neither key carries HTML, so
            # the regex calls below never receive None (TypeError).
            feed_html = info.get('feed_html') or info.get('content_html') or ''
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            page_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            new_ids = [
                video_id for video_id in page_ids
                if video_id not in seen_ids]
            # A page without any new video means further requests would only
            # repeat content; stop instead of following the load-more link
            # indefinitely.
            if not new_ids:
                break
            seen_ids.update(new_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in new_ids)

            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1546
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed (URL or ":ytrec" keyword)."""
    # Feed identity consumed by the YoutubeFeedsInfoExtractor machinery.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1552
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (URL or ":ytwatchlater" keyword)."""
    # Feed identity consumed by the YoutubeFeedsInfoExtractor machinery.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # Makes _FEED_TEMPLATE use action_load_personal_feed.
    _PERSONAL_FEED = True

    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1559
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (URL or ":ythistory" keyword)."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # (warned about by newer Pythons); the pattern itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Makes _FEED_TEMPLATE use action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1566
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its underlying playlist."""
    _LOGIN_REQUIRED = True
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Fetch the my_favorites page and hand its playlist id to YoutubePlaylist.

        The incoming ``url`` is not used; the fixed my_favorites address is
        always downloaded.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is whatever follows the first "list=" up to a quote or "&".
        favourites_list = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list, 'YoutubePlaylist')
15870e90
PH
1577
1578
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed into a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Walk the subscriptions feed page by page and return its videos.

        The first page is the HTML feed itself; every following page is a
        JSON document reached through the ``data-uix-load-more-href`` link
        of the previous one. The incoming ``url`` is not used.
        """
        title = 'Youtube Subscriptions'
        page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1

        while True:
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            ids.extend(orderedSet(found))

            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1615
1616
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a shell-quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)