]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[adultswim] PEP8
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
652cdaa2 27 get_element_by_attribute,
c5e8d7af 28 ExtractorError,
dd27fd17 29 int_or_none,
9c44d242 30 OnDemandPagedList,
c5e8d7af
PH
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (with hl=en) on the .youtube.com domain."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed (including when the login page
        cannot be fetched or the two-factor form cannot be parsed).

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # The download was non-fatal; report the failure as documented
            # instead of silently returning None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            tfa_hidden = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    self._downloader.report_warning(
                        'Failed to get %s - did the page structure change?' % field)
                    # Without this value the TFA form cannot be submitted;
                    # bail out rather than dereferencing a missing match.
                    return False
                tfa_hidden[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': tfa_hidden['secTok'],
                'timeStmp': tfa_hidden['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 184
8377574c 185
de7f3446 186class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 187 IE_DESC = 'YouTube.com'
cb7dfeea 188 _VALID_URL = r"""(?x)^
c5e8d7af 189 (
edb53e2d 190 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 191 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 192 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 193 (?:www\.)?pwnyoutube\.com/|
f7000f3a 194 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
195 tube\.majestyc\.net/|
196 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
197 (?:.*?\#/)? # handle anchor (#/) redirect urls
198 (?: # the various things that can precede the ID:
ac7553d0 199 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 200 |(?: # or the v= param in all its forms
f7000f3a 201 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
202 (?:\?|\#!?) # the params delimiter ? or # or #!
203 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 v=
205 )
f4b05232
JMF
206 ))
207 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 208 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 209 )
c5e8d7af 210 )? # all until now is optional -> you can pass the naked ID
8963d9c2 211 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 212 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
213 (?(1).+)? # if we found the ID, everything can follow
214 $"""
c5e8d7af 215 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
216 _formats = {
217 '5': {'ext': 'flv', 'width': 400, 'height': 240},
218 '6': {'ext': 'flv', 'width': 450, 'height': 270},
219 '13': {'ext': '3gp'},
220 '17': {'ext': '3gp', 'width': 176, 'height': 144},
221 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
222 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
223 '34': {'ext': 'flv', 'width': 640, 'height': 360},
224 '35': {'ext': 'flv', 'width': 854, 'height': 480},
225 '36': {'ext': '3gp', 'width': 320, 'height': 240},
226 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
227 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
228 '43': {'ext': 'webm', 'width': 640, 'height': 360},
229 '44': {'ext': 'webm', 'width': 854, 'height': 480},
230 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
231 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
232
1d043b93 233
86fe61c8 234 # 3d videos
43b81eb9
PH
235 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
236 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
237 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
238 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
239 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
240 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
241 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 242
96fb5605 243 # Apple HTTP Live Streaming
43b81eb9
PH
244 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
245 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
246 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
247 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
248 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
249 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
250 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
251
252 # DASH mp4 video
43b81eb9
PH
253 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
261 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 264
f6f1fc92 265 # Dash mp4 audio
2c62dc26
PH
266 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
267 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
268 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
269
270 # Dash webm
e75cafe9
A
271 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 277 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
278 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 285 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 286 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
287 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
288 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 289 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
290
291 # Dash webm audio
55db73ef 292 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 293 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 294
0857baad
PH
295 # Dash webm audio with opus inside
296 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
297 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
298 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
299
ce6b9a2d
PH
300 # RTMP (unnamed)
301 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 302 }
836a086c 303
78caa52a 304 IE_NAME = 'youtube'
2eb88d95
PH
305 _TESTS = [
306 {
4bc3a23e
PH
307 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
308 'info_dict': {
309 'id': 'BaW_jenozKc',
310 'ext': 'mp4',
311 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
312 'uploader': 'Philipp Hagemeister',
313 'uploader_id': 'phihag',
314 'upload_date': '20121002',
315 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
316 'categories': ['Science & Technology'],
3e7c1224
PH
317 'like_count': int,
318 'dislike_count': int,
2eb88d95 319 }
0e853ca4 320 },
0e853ca4 321 {
4bc3a23e
PH
322 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
323 'note': 'Test generic use_cipher_signature video (#897)',
324 'info_dict': {
325 'id': 'UxxajLWwzqY',
326 'ext': 'mp4',
327 'upload_date': '20120506',
328 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
329 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
330 'uploader': 'Icona Pop',
331 'uploader_id': 'IconaPop',
2eb88d95 332 }
c108eb73
JMF
333 },
334 {
4bc3a23e
PH
335 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
336 'note': 'Test VEVO video with age protection (#956)',
337 'info_dict': {
338 'id': '07FYdnEawAQ',
339 'ext': 'mp4',
340 'upload_date': '20130703',
341 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
342 'description': 'md5:64249768eec3bc4276236606ea996373',
343 'uploader': 'justintimberlakeVEVO',
344 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
345 }
346 },
fccd3771 347 {
4bc3a23e
PH
348 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
349 'note': 'Embed-only video (#1746)',
350 'info_dict': {
351 'id': 'yZIXLfi8CZQ',
352 'ext': 'mp4',
353 'upload_date': '20120608',
354 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
355 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
356 'uploader': 'SET India',
357 'uploader_id': 'setindia'
fccd3771
PH
358 }
359 },
dd27fd17 360 {
4bc3a23e
PH
361 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
362 'note': '256k DASH audio (format 141) via DASH manifest',
363 'info_dict': {
364 'id': 'a9LDPn-MO4I',
365 'ext': 'm4a',
366 'upload_date': '20121002',
367 'uploader_id': '8KVIDEO',
368 'description': '',
369 'uploader': '8KVIDEO',
370 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 371 },
4bc3a23e
PH
372 'params': {
373 'youtube_include_dash_manifest': True,
374 'format': '141',
4919603f 375 },
dd27fd17 376 },
3489b7d2
JMF
377 # DASH manifest with encrypted signature
378 {
78caa52a
PH
379 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
380 'info_dict': {
381 'id': 'IB3lcPjvWLA',
382 'ext': 'm4a',
b766eb27
JMF
383 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
384 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
385 'uploader': 'AfrojackVEVO',
386 'uploader_id': 'AfrojackVEVO',
387 'upload_date': '20131011',
3489b7d2 388 },
4bc3a23e 389 'params': {
78caa52a
PH
390 'youtube_include_dash_manifest': True,
391 'format': '141',
3489b7d2
JMF
392 },
393 },
aa79ac0c
PH
394 # Controversy video
395 {
396 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
397 'info_dict': {
398 'id': 'T4XJQO3qol8',
399 'ext': 'mp4',
400 'upload_date': '20100909',
401 'uploader': 'The Amazing Atheist',
402 'uploader_id': 'TheAmazingAtheist',
403 'title': 'Burning Everyone\'s Koran',
404 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
405 }
c522adb1
JMF
406 },
407 # Normal age-gate video (No vevo, embed allowed)
408 {
409 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
410 'info_dict': {
411 'id': 'HtVdAasjOgU',
412 'ext': 'mp4',
413 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
414 'description': 'md5:eca57043abae25130f58f655ad9a7771',
415 'uploader': 'The Witcher',
416 'uploader_id': 'WitcherGame',
417 'upload_date': '20140605',
418 },
419 },
2eb88d95
PH
420 ]
421
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily with signature-deciphering callables
    self._player_cache = dict()
e0df6211 425
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 429
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
433
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
437
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 441
60064c53
PH
442 def _signature_cache_id(self, example_sig):
443 """ Return a string representation of a signature """
78caa52a 444 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
445
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are taken from player_url.
    If the 'youtube-sigfuncs' cache already holds an index list for this
    player and signature shape, a function applying that index
    permutation is returned without downloading anything. Otherwise the
    player is downloaded and parsed, the resulting function is sampled
    to derive the index list, and that list is stored in the cache.

    Raises ExtractorError if the player URL cannot be identified or the
    player type is not supported.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is used as a cache key; it must not contain path components
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would be stripped under -O and leave
        # `res` unbound below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string whose i-th character has code point i through the
    # function; the code points of the output are the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
487
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function `func`.

    `func` is sampled with a probe string as long as `example_sig`; the
    resulting index permutation is rendered as a sum of index and slice
    expressions on `s` and written out via to_screen.
    """
    def gen_sig_code(idxs):
        # Yields 's[...]' expressions covering idxs in order; runs of
        # indices that change by a constant +1 or -1 become one slice.
        def _genslice(start, end, step):
            # Render the inclusive run start..end as a slice expression
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        start = '(Never used)'  # Squelch pyflakes warnings - start will be
        # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: keep going while the stride is unchanged
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run begins at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is pending after the last pair.
        # NOTE(review): `i` is only bound if the loop ran at least once,
        # i.e. idxs has two or more entries - confirm callers guarantee it.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # The i-th probe character has code point i, so the code points of the
    # output are the source indices chosen by func.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    # NOTE(review): whitespace inside the second literal may have been
    # collapsed by extraction - confirm against the repository.
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 526
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from JS player source.

    The function name is located with a regex on `jscode`, extracted with
    JSInterpreter, and wrapped so that it takes the signature string as
    its only argument.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return js_function([s])

    return decipher
535
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from SWF player contents.

    Extracts the 'decipher' function of the 'SignatureDecipher' class via
    SWFInterpreter and wraps it so that it takes the signature string as
    its only argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])

    return decipher
542
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    Deciphering functions are kept in self._player_cache, keyed by the
    player URL and the shape of the signature. `age_gate` is accepted but
    not used here. Any failure is re-raised as ExtractorError carrying
    the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 566
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from video.google.com; `webpage` is not
    used. When the list cannot be downloaded or contains no tracks, a
    warning is reported and an empty dict is returned. Only the first
    track seen for each language code is kept.
    """
    try:
        track_list = self._download_webpage(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    urls_by_lang = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_list)
    for track_name, lang in tracks:
        if lang not in urls_by_lang:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            urls_by_lang[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
594
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic caption URL.

    The captions URL is read from the ytplayer.config JSON embedded in
    `webpage`, which is passed in so it does not have to be downloaded
    again. A warning is reported and an empty dict returned when the
    config, the required keys, or an 'asr' track cannot be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']

        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(
            caption_url + '&' + list_query, video_id)

        source_track = caption_list.find('track')
        if source_track is None or source_track.attrib.get('kind') != 'asr':
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']

        urls_by_lang = {}
        for target_node in caption_list.findall('target'):
            target_lang = target_node.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            urls_by_lang[target_lang] = caption_url + '&' + query
        return urls_by_lang
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
641
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in `url`.

    Raises ExtractorError when `url` does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
649
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to URL.

    Empty lines and lines starting with '#' are ignored; every remaining
    line is treated as a format URL containing an 'itag/<digits>/' part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
664
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations page for `video_id`."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 668
def _real_extract(self, url):
    """Extract the metadata and the downloadable formats of a single video.

    The watch page is downloaded first.  The video info is then taken from
    the ``ytplayer.config`` object embedded in that page, or, when that is
    missing or has no stream map (and always for age-gated videos), from
    the ``get_video_info`` endpoint.  After the basic metadata has been
    collected, the formats are built from the RTMP ``conn`` entry, the
    encoded stream maps or the HLS manifest, and finally merged with the
    DASH manifest when one is advertised.

    Returns the info dictionary of the video, or None when the
    ``listsubtitles`` option is set (the subtitles are only listed then).

    Raises ExtractorError when the video info carries no ``token``, for
    rental videos, when the uploader name is missing, for rtmpe streams
    and when no format information at all is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # The URL is embedded as a JSON string: drop the backslash escapes
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                # The first variant that yields a token is the usable one
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn the separators into single spaces before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories (only the first link of the "Category" list is used)
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', fatal=False)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        """Return the watch-page counter *count_name* as int, or None if absent."""
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        """Turn an {itag: url} mapping into a list of format dictionaries."""
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: the player code is needed to decrypt it
                encrypted_sig = url_data['s'][0]

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest.
    # Not every video info carries a 'dashmpd' entry; indexing the missing
    # value used to raise a TypeError that the handler below does not
    # catch, so the DASH step is skipped entirely in that case.
    dash_mpd = video_info.get('dashmpd')
    if dash_mpd and self._downloader.params.get('youtube_include_dash_manifest', True):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            dash_manifest_url = dash_mpd[0]

            def decrypt_sig(mobj):
                """Replace an encrypted '/s/...' path segment by '/signature/...'."""
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note='Downloading DASH manifest',
                errnote='Could not download DASH manifest')
            for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                # Update the format already known under this id, or add a
                # new one completed with the static format table
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning('Skipping DASH manifest: %r' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1013
5f6a1245 1014
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, given either as a URL or as a bare playlist id."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id into a url result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist (id starting with 'RD') from its watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Use the first class name that yields a non-empty element
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_video_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(mix_video_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for *url*.

        A watch URL that also names a video returns just that video when the
        'noplaylist' option is set.  Mixes are delegated to _extract_mix.
        Raises ExtractorError for an invalid URL, for top lists ('TL' ids)
        and when the playlist does not exist or is private.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(
            self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable_re = r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>'
        if re.search(unavailable_re, page):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page
        # serves both as content and as "load more" widget
        ids = []
        content_html = more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1185
1186
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the list named in *url* on the channel page and return its videos.

        The channel page is searched for the link of the requested list;
        that page is then downloaded until it contains video ids.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The link of the list is recognised by its url-encoded title
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        # NOTE(review): the number of retries is unbounded; a page that never
        # lists any video keeps this loop running.
        for i in itertools.count(0):
            # Was 'Downloading Youtube mix', a leftover copied from the mix
            # extraction that mislabelled this download
            msg = 'Downloading Youtube top list'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1228
1229
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all the videos of a channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, without duplicates, in order of first appearance."""
        ids_in_page = []
        already_seen = set()
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in already_seen:
                continue
            already_seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect the video ids of the channel in *url* and return them as a playlist."""
        channel_id = self._match_id(url)

        channel_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(channel_url, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 0
            while True:
                pagenum += 1
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers to load more videos
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1284
1285
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a user, given as a URL or with the "ytuser" keyword."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether *url* belongs to this extractor and to no other one of this module."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match such urls too.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the uploads of the user in *url*."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video of the zero-based page *pagenum*."""
            page_size = self._GDATA_PAGE_SIZE
            start_index = pagenum * page_size + 1

            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # No entries at all on this page
                return

            # Extract video identifiers
            for entry in feed['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        return self.playlist_result(
            OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE),
            playlist_title=username)
1354
b05654f0
PH
1355
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos API (keyword "ytsearch")."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            # The API start index is one-based
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API reports to have
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have brought more ids than requested
        videos = []
        for video_id in video_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1397
c9ae7b95 1398
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, with 'orderby=published' added to the API URL
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1404
c9ae7b95
PH
1405
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for the URLs of search result pages."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result listed on the page *url*."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            """Build the url entry of one result heading."""
            # The title is optional, the link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            entries.append(build_entry(part_code))

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1447
1448
class YoutubeShowIE(InfoExtractor):
    """Extractor for shows; every season is returned as a playlist url."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        # A missing title is not an error
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1483
1484
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that page through
    https://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # Set to True to request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining %s takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Fetch the feed page by page and return all found videos as a playlist."""
        entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, look for the "load more" link in the feed HTML itself
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            # De-duplicate the video ids of this page, keeping first-seen order
            video_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No "load more" link: this was the last page
                break
            paging = next_page.group('paging')
            page_num += 1

        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1530
5f6a1245 1531
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Accepts the feed URL as well as the ":ytrec" / ":ytrecommended" keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _FEED_NAME = 'recommended'
c626a3d9 1537
5f6a1245 1538
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    # Accepts the feed URL as well as the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _FEED_NAME = 'watch_later'
    # Loaded through action_load_personal_feed
    _PERSONAL_FEED = True
c626a3d9 1545
5f6a1245 1546
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the pattern contains the regex escape "\.", which is an
    # invalid escape sequence in a regular string literal.
    # Accepts the feed URL as well as the ":ythistory" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Loaded through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1553
5f6a1245 1554
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Download the favourites page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from a list=... parameter found in the page
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1565
1566
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed into a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Walk the feed through its "load more" links and return all video ids as a playlist."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # does not include an index
        video_ids = []
        # For the initial page both the videos and the "load more" link are
        # searched in the same HTML document
        content_html = first_page
        widget_html = first_page
        page_num = 1

        while True:
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            video_ids.extend(orderedSet(found))

            next_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if next_link is None:
                # No further page advertised
                break

            more = self._download_json(
                'https://youtube.com/%s' % next_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            widget_html = more['load_more_widget_html']
            page_num += 1

        entries = self._ids_to_results(video_ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1603
1604
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and report a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Two shapes are matched: a /watch? URL whose query is empty or holds
    # only a feature/annotation_id parameter, and an attribution_link URL
    # with nothing after its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)