]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[smotri:broadcast] Fix extraction
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
652cdaa2 27 get_element_by_attribute,
c5e8d7af 28 ExtractorError,
dd27fd17 29 int_or_none,
9c44d242 30 OnDemandPagedList,
c5e8d7af
PH
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented
            # contract of this method.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail
                # out instead of dereferencing a None match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; no-op without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 184
8377574c 185
de7f3446 186class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 187 IE_DESC = 'YouTube.com'
cb7dfeea 188 _VALID_URL = r"""(?x)^
c5e8d7af 189 (
edb53e2d 190 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 191 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 192 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 193 (?:www\.)?pwnyoutube\.com/|
f7000f3a 194 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
195 tube\.majestyc\.net/|
196 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
197 (?:.*?\#/)? # handle anchor (#/) redirect urls
198 (?: # the various things that can precede the ID:
ac7553d0 199 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 200 |(?: # or the v= param in all its forms
f7000f3a 201 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
202 (?:\?|\#!?) # the params delimiter ? or # or #!
203 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 v=
205 )
f4b05232
JMF
206 ))
207 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 208 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 209 )
c5e8d7af 210 )? # all until now is optional -> you can pass the naked ID
8963d9c2 211 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 212 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
213 (?(1).+)? # if we found the ID, everything can follow
214 $"""
c5e8d7af 215 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
216 _formats = {
217 '5': {'ext': 'flv', 'width': 400, 'height': 240},
218 '6': {'ext': 'flv', 'width': 450, 'height': 270},
219 '13': {'ext': '3gp'},
220 '17': {'ext': '3gp', 'width': 176, 'height': 144},
221 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
222 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
223 '34': {'ext': 'flv', 'width': 640, 'height': 360},
224 '35': {'ext': 'flv', 'width': 854, 'height': 480},
225 '36': {'ext': '3gp', 'width': 320, 'height': 240},
226 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
227 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
228 '43': {'ext': 'webm', 'width': 640, 'height': 360},
229 '44': {'ext': 'webm', 'width': 854, 'height': 480},
230 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
231 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
232
1d043b93 233
86fe61c8 234 # 3d videos
43b81eb9
PH
235 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
236 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
237 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
238 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
239 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
240 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
241 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 242
96fb5605 243 # Apple HTTP Live Streaming
43b81eb9
PH
244 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
245 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
246 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
247 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
248 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
249 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
250 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
251
252 # DASH mp4 video
43b81eb9
PH
253 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
261 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 264
f6f1fc92 265 # Dash mp4 audio
2c62dc26
PH
266 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
267 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
268 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
269
270 # Dash webm
e75cafe9
A
271 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 277 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
278 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 285 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 286 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
287 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
288 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 289 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
290
291 # Dash webm audio
55db73ef 292 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 293 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 294
0857baad
PH
295 # Dash webm audio with opus inside
296 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
297 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
298 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
299
ce6b9a2d
PH
300 # RTMP (unnamed)
301 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 302 }
836a086c 303
78caa52a 304 IE_NAME = 'youtube'
2eb88d95
PH
305 _TESTS = [
306 {
4bc3a23e
PH
307 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
308 'info_dict': {
309 'id': 'BaW_jenozKc',
310 'ext': 'mp4',
311 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
312 'uploader': 'Philipp Hagemeister',
313 'uploader_id': 'phihag',
314 'upload_date': '20121002',
315 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
316 'categories': ['Science & Technology'],
3e7c1224
PH
317 'like_count': int,
318 'dislike_count': int,
2eb88d95 319 }
0e853ca4 320 },
0e853ca4 321 {
4bc3a23e
PH
322 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
323 'note': 'Test generic use_cipher_signature video (#897)',
324 'info_dict': {
325 'id': 'UxxajLWwzqY',
326 'ext': 'mp4',
327 'upload_date': '20120506',
328 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
329 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
330 'uploader': 'Icona Pop',
331 'uploader_id': 'IconaPop',
2eb88d95 332 }
c108eb73
JMF
333 },
334 {
4bc3a23e
PH
335 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
336 'note': 'Test VEVO video with age protection (#956)',
337 'info_dict': {
338 'id': '07FYdnEawAQ',
339 'ext': 'mp4',
340 'upload_date': '20130703',
341 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
342 'description': 'md5:64249768eec3bc4276236606ea996373',
343 'uploader': 'justintimberlakeVEVO',
344 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
345 }
346 },
fccd3771 347 {
4bc3a23e
PH
348 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
349 'note': 'Embed-only video (#1746)',
350 'info_dict': {
351 'id': 'yZIXLfi8CZQ',
352 'ext': 'mp4',
353 'upload_date': '20120608',
354 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
355 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
356 'uploader': 'SET India',
357 'uploader_id': 'setindia'
fccd3771
PH
358 }
359 },
dd27fd17 360 {
4bc3a23e
PH
361 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
362 'note': '256k DASH audio (format 141) via DASH manifest',
363 'info_dict': {
364 'id': 'a9LDPn-MO4I',
365 'ext': 'm4a',
366 'upload_date': '20121002',
367 'uploader_id': '8KVIDEO',
368 'description': '',
369 'uploader': '8KVIDEO',
370 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 371 },
4bc3a23e
PH
372 'params': {
373 'youtube_include_dash_manifest': True,
374 'format': '141',
4919603f 375 },
dd27fd17 376 },
3489b7d2
JMF
377 # DASH manifest with encrypted signature
378 {
78caa52a
PH
379 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
380 'info_dict': {
381 'id': 'IB3lcPjvWLA',
382 'ext': 'm4a',
b766eb27
JMF
383 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
384 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
385 'uploader': 'AfrojackVEVO',
386 'uploader_id': 'AfrojackVEVO',
387 'upload_date': '20131011',
3489b7d2 388 },
4bc3a23e 389 'params': {
78caa52a
PH
390 'youtube_include_dash_manifest': True,
391 'format': '141',
3489b7d2
JMF
392 },
393 },
aa79ac0c
PH
394 # Controversy video
395 {
396 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
397 'info_dict': {
398 'id': 'T4XJQO3qol8',
399 'ext': 'mp4',
400 'upload_date': '20100909',
401 'uploader': 'The Amazing Atheist',
402 'uploader_id': 'TheAmazingAtheist',
403 'title': 'Burning Everyone\'s Koran',
404 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
405 }
c522adb1
JMF
406 },
407 # Normal age-gate video (No vevo, embed allowed)
408 {
409 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
410 'info_dict': {
411 'id': 'HtVdAasjOgU',
412 'ext': 'mp4',
413 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
414 'description': 'md5:eca57043abae25130f58f655ad9a7771',
415 'uploader': 'The Witcher',
416 'uploader_id': 'WitcherGame',
417 'upload_date': '20140605',
418 },
419 },
774e208f
PH
420 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
421 {
422 'url': '__2ABJjxzNo',
423 'info_dict': {
424 'id': '__2ABJjxzNo',
425 'ext': 'mp4',
426 'upload_date': '20100430',
427 'uploader_id': 'deadmau5',
428 'description': 'md5:12c56784b8032162bb936a5f76d55360',
429 'uploader': 'deadmau5',
430 'title': 'Deadmau5 - Some Chords (HD)',
431 },
432 'expected_warnings': [
433 'DASH manifest missing',
434 ]
435 }
2eb88d95
PH
436 ]
437
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature cache id) to a signature function;
    # populated by _decrypt_signature.
    self._player_cache = dict()
e0df6211 441
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 445
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
449
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
453
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 457
60064c53
PH
458 def _signature_cache_id(self, example_sig):
459 """ Return a string representation of a signature """
78caa52a 460 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
461
462 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 463 id_m = re.match(
c081b35c 464 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 465 player_url)
c081b35c
PH
466 if not id_m:
467 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
468 player_type = id_m.group('ext')
469 player_id = id_m.group('id')
470
c4417ddb 471 # Read from filesystem cache
60064c53
PH
472 func_id = '%s_%s_%s' % (
473 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 474 assert os.path.basename(func_id) == func_id
a0e07d31 475
69ea8ca4 476 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 477 if cache_spec is not None:
78caa52a 478 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 479
e0df6211
PH
480 if player_type == 'js':
481 code = self._download_webpage(
482 player_url, video_id,
69ea8ca4
PH
483 note='Downloading %s player %s' % (player_type, player_id),
484 errnote='Download of %s failed' % player_url)
83799698 485 res = self._parse_sig_js(code)
c4417ddb 486 elif player_type == 'swf':
e0df6211
PH
487 urlh = self._request_webpage(
488 player_url, video_id,
69ea8ca4
PH
489 note='Downloading %s player %s' % (player_type, player_id),
490 errnote='Download of %s failed' % player_url)
e0df6211 491 code = urlh.read()
83799698 492 res = self._parse_sig_swf(code)
e0df6211
PH
493 else:
494 assert False, 'Invalid player type %r' % player_type
495
a0e07d31 496 if cache_spec is None:
78caa52a 497 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
498 cache_res = res(test_string)
499 cache_spec = [ord(c) for c in cache_res]
83799698 500
69ea8ca4 501 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
502 return res
503
60064c53 504 def _print_sig_code(self, func, example_sig):
edf3e38e
PH
505 def gen_sig_code(idxs):
506 def _genslice(start, end, step):
78caa52a 507 starts = '' if start == 0 else str(start)
8bcc8756 508 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
69ea8ca4 509 steps = '' if step == 1 else (':%d' % step)
78caa52a 510 return 's[%s%s%s]' % (starts, ends, steps)
edf3e38e
PH
511
512 step = None
0ca96d48
PH
513 start = '(Never used)' # Quelch pyflakes warnings - start will be
514 # set as soon as step is set
edf3e38e
PH
515 for i, prev in zip(idxs[1:], idxs[:-1]):
516 if step is not None:
517 if i - prev == step:
518 continue
519 yield _genslice(start, prev, step)
520 step = None
521 continue
522 if i - prev in [-1, 1]:
523 step = i - prev
524 start = prev
525 continue
526 else:
78caa52a 527 yield 's[%d]' % prev
edf3e38e 528 if step is None:
78caa52a 529 yield 's[%d]' % i
edf3e38e
PH
530 else:
531 yield _genslice(start, i, step)
532
78caa52a 533 test_string = ''.join(map(compat_chr, range(len(example_sig))))
c705320f 534 cache_res = func(test_string)
edf3e38e 535 cache_spec = [ord(c) for c in cache_res]
78caa52a 536 expr_code = ' + '.join(gen_sig_code(cache_spec))
60064c53
PH
537 signature_id_tuple = '(%s)' % (
538 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
69ea8ca4 539 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
78caa52a 540 ' return %s\n') % (signature_id_tuple, expr_code)
69ea8ca4 541 self.to_screen('Extracted signature function:\n' + code)
edf3e38e 542
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable running the signature function found in JS player code.

    The function name is located with a regex on jscode and the function
    itself is extracted with JSInterpreter; the returned callable passes
    its argument to it as a one-element argument list.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')
    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(sig_func_name)

    def run(s):
        return decipher([s])
    return run
551
def _parse_sig_swf(self, file_contents):
    """Return a callable running SignatureDecipher.decipher from SWF player data.

    The class and its 'decipher' function are extracted with
    SWFInterpreter; the returned callable passes its argument to that
    function as a one-element argument list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        return decipher([s])
    return run
558
83799698 559 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 560 """Turn the encrypted s field into a working signature"""
6b37f0be 561
c8bf86d5 562 if player_url is None:
69ea8ca4 563 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 564
69ea8ca4 565 if player_url.startswith('//'):
78caa52a 566 player_url = 'https:' + player_url
c8bf86d5 567 try:
62af3a0e 568 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
569 if player_id not in self._player_cache:
570 func = self._extract_signature_function(
60064c53 571 video_id, player_url, s
c8bf86d5
PH
572 )
573 self._player_cache[player_id] = func
574 func = self._player_cache[player_id]
575 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 576 self._print_sig_code(func, s)
c8bf86d5
PH
577 return func(s)
578 except Exception as e:
579 tb = traceback.format_exc()
580 raise ExtractorError(
78caa52a 581 'Signature extraction failed: ' + tb, cause=e)
e0df6211 582
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to subtitle download URLs.

    The track list is fetched from video.google.com/timedtext; only the
    first track listed for each language is kept. A warning is issued
    and an empty dict returned when the list cannot be downloaded or
    contains no tracks. The webpage argument is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        if lang in sub_lang_list:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
610
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language codes to automatic caption URLs.

    The caption URL and timestamp are read from the ytplayer.config JSON
    embedded in webpage (passed in to avoid downloading it again). A
    warning is issued and an empty dict returned when the config or the
    needed keys are missing, the caption list cannot be downloaded, or
    the first listed track is not of kind 'asr'.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)
        first_track = caption_list.find('track')
        if first_track is None or first_track.attrib.get('kind') != 'asr':
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = first_track.attrib['lang_code']

        def translated_caption_url(target_lang):
            return caption_url + '&' + compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })

        return dict(
            (node.attrib['lang_code'], translated_caption_url(node.attrib['lang_code']))
            for node in caption_list.findall('target'))
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
657
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError if url does not match _VALID_URL.
    """
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is not None:
        return mobj.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
665
1d043b93
JMF
666 def _extract_from_m3u8(self, manifest_url, video_id):
667 url_map = {}
5f6a1245 668
1d043b93
JMF
669 def _get_urls(_manifest):
670 lines = _manifest.split('\n')
671 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 672 lines)
1d043b93 673 return urls
78caa52a 674 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
675 formats_urls = _get_urls(manifest)
676 for format_url in formats_urls:
890f62e8 677 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
678 url_map[itag] = format_url
679 return url_map
680
1fb07d10
JG
681 def _extract_annotations(self, video_id):
682 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 683 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 684
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return a list of format dicts.

    Any '/s/<sig>' component of the manifest URL is replaced by
    '/signature/<decrypted sig>' before downloading. One format dict is
    produced per Representation element that has a BaseURL child; the
    first occurrence of a format id is completed with the static data in
    self._formats, later occurrences update that same dict in place.
    """
    def decrypt_sig(mobj):
        return '/signature/%s' % self._decrypt_signature(
            mobj.group(1), video_id, player_url, age_gate)

    manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    # Index of the dicts already in `formats`, by format id
    by_format_id = {}
    for representation in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        url_el = representation.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if url_el is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        video_url = url_el.text
        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
        f = {
            'format_id': format_id,
            'url': video_url,
            'width': int_or_none(attrs.get('width')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': filesize,
            'fps': int_or_none(attrs.get('frameRate')),
        }
        known = by_format_id.get(format_id)
        if known is not None:
            known.update(f)
            continue
        f.update(self._formats.get(format_id, {}))
        formats.append(f)
        by_format_id[format_id] = f
    return formats
724
c5e8d7af 725 def _real_extract(self, url):
7e8c0af0 726 proto = (
78caa52a
PH
727 'http' if self._downloader.params.get('prefer_insecure', False)
728 else 'https')
7e8c0af0 729
c5e8d7af
PH
730 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
731 mobj = re.search(self._NEXT_URL_RE, url)
732 if mobj:
7e8c0af0 733 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 734 video_id = self.extract_id(url)
c5e8d7af
PH
735
736 # Get video webpage
aa79ac0c 737 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 738 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
739
740 # Attempt to extract SWF player URL
e0df6211 741 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
742 if mobj is not None:
743 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
744 else:
745 player_url = None
746
747 # Get video info
c108eb73 748 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
749 age_gate = True
750 # We simulate the access to the video from www.youtube.com/v/{video_id}
751 # this can be viewed without login into Youtube
2c57c7fa
JMF
752 data = compat_urllib_parse.urlencode({
753 'video_id': video_id,
754 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 755 'sts': self._search_regex(
94bd3613 756 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
2c57c7fa 757 })
7e8c0af0 758 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
759 video_info_webpage = self._download_webpage(
760 video_info_url, video_id,
20436c30 761 note='Refetching age-gated info webpage',
94bd3613 762 errnote='unable to download video info webpage')
c5e8d7af 763 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
764 else:
765 age_gate = False
4e62ebe2
JMF
766 try:
767 # Try looking directly into the video webpage
768 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
769 if not mobj:
770 raise ValueError('Could not find ytplayer.config') # caught below
771 json_code = uppercase_escape(mobj.group(1))
772 ytplayer_config = json.loads(json_code)
773 args = ytplayer_config['args']
774 # Convert to the same format returned by compat_parse_qs
775 video_info = dict((k, [v]) for k, v in args.items())
776 if 'url_encoded_fmt_stream_map' not in args:
777 raise ValueError('No stream_map present') # caught below
778 except ValueError:
779 # We fallback to the get_video_info pages (used by the embed page)
780 self.report_video_info_webpage_download(video_id)
781 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
810fb84d
PH
782 video_info_url = (
783 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
784 % (proto, video_id, el_type))
785 video_info_webpage = self._download_webpage(
786 video_info_url,
4e62ebe2
JMF
787 video_id, note=False,
788 errnote='unable to download video info webpage')
789 video_info = compat_parse_qs(video_info_webpage)
790 if 'token' in video_info:
791 break
c5e8d7af
PH
792 if 'token' not in video_info:
793 if 'reason' in video_info:
d11271dd 794 raise ExtractorError(
78caa52a 795 'YouTube said: %s' % video_info['reason'][0],
d11271dd 796 expected=True, video_id=video_id)
c5e8d7af 797 else:
d11271dd 798 raise ExtractorError(
78caa52a 799 '"token" parameter not in video info for unknown reason',
d11271dd 800 video_id=video_id)
c5e8d7af 801
1d699755
PH
802 if 'view_count' in video_info:
803 view_count = int(video_info['view_count'][0])
804 else:
805 view_count = None
806
c5e8d7af
PH
807 # Check for "rental" videos
808 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 809 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
810
811 # Start extracting information
812 self.report_information_extraction(video_id)
813
814 # uploader
815 if 'author' not in video_info:
69ea8ca4 816 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
817 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
818
819 # uploader_id
820 video_uploader_id = None
821 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
822 if mobj is not None:
823 video_uploader_id = mobj.group(1)
824 else:
69ea8ca4 825 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
826
827 # title
a8c6b241 828 if 'title' in video_info:
aa92f063 829 video_title = video_info['title'][0]
a8c6b241 830 else:
69ea8ca4 831 self._downloader.report_warning('Unable to extract video title')
78caa52a 832 video_title = '_'
c5e8d7af
PH
833
834 # thumbnail image
7763b04e
JMF
835 # We try first to get a high quality image:
836 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
837 video_webpage, re.DOTALL)
838 if m_thumb is not None:
839 video_thumbnail = m_thumb.group(1)
840 elif 'thumbnail_url' not in video_info:
69ea8ca4 841 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 842 video_thumbnail = None
c5e8d7af
PH
843 else: # don't panic if we can't find it
844 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
845
846 # upload date
847 upload_date = None
ad3bc6ac 848 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
849 if mobj is None:
850 mobj = re.search(
263bd4ec 851 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 852 video_webpage)
c5e8d7af
PH
853 if mobj is not None:
854 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
855 upload_date = unified_strdate(upload_date)
856
55f7bd2d
PH
857 m_cat_container = self._search_regex(
858 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
859 video_webpage, 'categories', fatal=False)
ec8deefc 860 if m_cat_container:
ad3bc6ac 861 category = self._html_search_regex(
01ed5c9b 862 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
863 default=None)
864 video_categories = None if category is None else [category]
865 else:
866 video_categories = None
ec8deefc 867
c5e8d7af
PH
868 # description
869 video_description = get_element_by_id("eow-description", video_webpage)
870 if video_description:
27dcce19
PH
871 video_description = re.sub(r'''(?x)
872 <a\s+
873 (?:[a-zA-Z-]+="[^"]+"\s+)*?
874 title="([^"]+)"\s+
875 (?:[a-zA-Z-]+="[^"]+"\s+)*?
876 class="yt-uix-redirect-link"\s*>
877 [^<]+
878 </a>
879 ''', r'\1', video_description)
c5e8d7af
PH
880 video_description = clean_html(video_description)
881 else:
882 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
883 if fd_mobj:
884 video_description = unescapeHTML(fd_mobj.group(1))
885 else:
78caa52a 886 video_description = ''
c5e8d7af 887
f30a38be 888 def _extract_count(count_name):
46374a56 889 count = self._search_regex(
f30a38be
JMF
890 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
891 video_webpage, count_name, default=None)
336c3a69
JMF
892 if count is not None:
893 return int(count.replace(',', ''))
894 return None
69ea8ca4
PH
895 like_count = _extract_count('like')
896 dislike_count = _extract_count('dislike')
336c3a69 897
c5e8d7af 898 # subtitles
d82134c3 899 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 900
c5e8d7af 901 if self._downloader.params.get('listsubtitles', False):
d665f8d3 902 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
903 return
904
905 if 'length_seconds' not in video_info:
69ea8ca4 906 self._downloader.report_warning('unable to extract video duration')
b466b702 907 video_duration = None
c5e8d7af 908 else:
b466b702 909 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 910
1fb07d10
JG
911 # annotations
912 video_annotations = None
913 if self._downloader.params.get('writeannotations', False):
5f6a1245 914 video_annotations = self._extract_annotations(video_id)
1fb07d10 915
dd27fd17
PH
916 def _map_to_format_list(urlmap):
917 formats = []
918 for itag, video_real_url in urlmap.items():
919 dct = {
920 'format_id': itag,
921 'url': video_real_url,
922 'player_url': player_url,
923 }
0b65e5d4
PH
924 if itag in self._formats:
925 dct.update(self._formats[itag])
dd27fd17
PH
926 formats.append(dct)
927 return formats
928
c5e8d7af
PH
929 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
930 self.report_rtmp_download()
dd27fd17
PH
931 formats = [{
932 'format_id': '_rtmp',
933 'protocol': 'rtmp',
934 'url': video_info['conn'][0],
935 'player_url': player_url,
936 }]
00fe14fc 937 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
5f6a1245 938 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 939 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 940 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 941 url_map = {}
00fe14fc 942 for url_data_str in encoded_url_map.split(','):
c5e8d7af 943 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
944 if 'itag' not in url_data or 'url' not in url_data:
945 continue
946 format_id = url_data['itag'][0]
947 url = url_data['url'][0]
948
949 if 'sig' in url_data:
950 url += '&signature=' + url_data['sig'][0]
951 elif 's' in url_data:
952 encrypted_sig = url_data['s'][0]
953
954 if not age_gate:
955 jsplayer_url_json = self._search_regex(
956 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 957 video_webpage, 'JS player URL')
201e9eaa
PH
958 player_url = json.loads(jsplayer_url_json)
959 if player_url is None:
960 player_url_json = self._search_regex(
961 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 962 video_webpage, 'age gate player URL')
201e9eaa
PH
963 player_url = json.loads(player_url_json)
964
965 if self._downloader.params.get('verbose'):
cf010131 966 if player_url is None:
201e9eaa
PH
967 player_version = 'unknown'
968 player_desc = 'unknown'
969 else:
970 if player_url.endswith('swf'):
971 player_version = self._search_regex(
972 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 973 'flash player', fatal=False)
201e9eaa 974 player_desc = 'flash player %s' % player_version
cf010131 975 else:
201e9eaa
PH
976 player_version = self._search_regex(
977 r'html5player-([^/]+?)(?:/html5player)?\.js',
978 player_url,
979 'html5 player', fatal=False)
78caa52a 980 player_desc = 'html5 player %s' % player_version
201e9eaa 981
60064c53 982 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 983 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 984 (format_id, parts_sizes, player_desc))
201e9eaa
PH
985
986 signature = self._decrypt_signature(
987 encrypted_sig, video_id, player_url, age_gate)
988 url += '&signature=' + signature
989 if 'ratebypass' not in url:
990 url += '&ratebypass=yes'
991 url_map[format_id] = url
dd27fd17 992 formats = _map_to_format_list(url_map)
1d043b93
JMF
993 elif video_info.get('hlsvp'):
994 manifest_url = video_info['hlsvp'][0]
995 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 996 formats = _map_to_format_list(url_map)
c5e8d7af 997 else:
69ea8ca4 998 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 999
dd27fd17 1000 # Look for the DASH manifest
203fb43f 1001 if self._downloader.params.get('youtube_include_dash_manifest', True):
774e208f
PH
1002 dash_mpd = video_info.get('dashmpd')
1003 if not dash_mpd:
1004 self.report_warning('%s: DASH manifest missing' % video_id)
1005 else:
1006 dash_manifest_url = dash_mpd[0]
1007 try:
1008 dash_formats = self._parse_dash_manifest(
da276600 1009 video_id, dash_manifest_url, player_url, age_gate)
774e208f
PH
1010 except (ExtractorError, KeyError) as e:
1011 self.report_warning(
1012 'Skipping DASH manifest: %r' % e, video_id)
1013 else:
1014 formats.extend(dash_formats)
d80044c2 1015
4bcc7bd1 1016 self._sort_formats(formats)
4ea3be0a 1017
1018 return {
8bcc8756
JW
1019 'id': video_id,
1020 'uploader': video_uploader,
1021 'uploader_id': video_uploader_id,
1022 'upload_date': upload_date,
1023 'title': video_title,
1024 'thumbnail': video_thumbnail,
1025 'description': video_description,
1026 'categories': video_categories,
1027 'subtitles': video_subtitles,
1028 'duration': video_duration,
1029 'age_limit': 18 if age_gate else 0,
1030 'annotations': video_annotations,
7e8c0af0 1031 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 1032 'view_count': view_count,
4ea3be0a 1033 'like_count': like_count,
1034 'dislike_count': dislike_count,
8bcc8756 1035 'formats': formats,
4ea3be0a 1036 }
c5e8d7af 1037
5f6a1245 1038
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, given as a URL or as a bare playlist id."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix: its title and video ids come from the watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # Use the first candidate class that yields a non-empty element
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        mix_video_re = (
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id))
        ids = orderedSet(re.findall(mix_video_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result (or a single video with --no-playlist)."""
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page is
        # both the content and the "load more" widget source.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1209
1210
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for top lists addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on re-downloads of the list page when it comes back without
    # videos, so a permanently empty page cannot cause an endless request loop.
    _MAX_RETRIES = 5

    def _real_extract(self, url):
        """Locate the list link on the channel page and return its videos.

        The list page is re-downloaded up to ``_MAX_RETRIES`` times when it
        contains no videos. Raises ExtractorError if it never does.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for i in range(self._MAX_RETRIES + 1):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos in top list "%s" after %d retries'
                % (title, self._MAX_RETRIES))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1252
1253
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a channel URL."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence order, no duplicates."""
        unique_ids = []
        seen = set()
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist of the channel's videos.

        Autogenerated channels are read from the single /videos page; other
        channels are paged lazily through the ajax endpoint.
        """
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id in self.extract_videos_from_page(channel_page):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            return self.playlist_result(entries, channel_id)

        def _entries():
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                for video_id in self.extract_videos_from_page(page['content_html']):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                # Stop once the widget no longer offers a further page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    return
                pagenum += 1

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1310
1311
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, given as a URL or with the "ytuser:" keyword."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class in this module accepts ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The API start index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Inclusive index of the last entry requested for this page
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1380
b05654f0
PH
1381
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the paged JSON API at ``_API_URL``."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until ``n`` ids are collected or
        the total reported by the API is reached. Raises ExtractorError when
        the search yields no results at all.
        """

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                if video_ids:
                    # A later page came back empty: keep what was already
                    # collected instead of discarding it with an error.
                    break
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never page beyond the total the API reports for this query
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
75dff0ee 1423
c9ae7b95 1424
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds ``orderby=published``."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1430
c9ae7b95
PH
1431
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for search result page URLs."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result title block on the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from_snippet(snippet):
            # The title is optional (fatal=False); the link is mandatory.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        snippets = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = [_entry_from_snippet(snippet) for snippet in snippets]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1473
1474
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; each linked playlist becomes one entry."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1509
1510
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for extractors reading a feed through
    http://www.youtube.com/feed_ajax

    Concrete subclasses have to provide the _FEED_NAME and
    _PLAYLIST_TITLE attributes.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL for this feed with a ``%s`` slot for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Fetch every page of the feed and return all videos as one playlist."""
        collected = []
        next_paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % next_paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            # The markup is read from 'feed_html', falling back to
            # 'content_html'; the "load more" widget falls back to that
            # same markup when it is not delivered separately.
            feed_html = info.get('feed_html') or info.get('content_html')
            widget_html = info.get('load_more_widget_html') or feed_html

            # Keep each video id once per page, in order of appearance.
            page_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            more = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                widget_html)
            if not more:
                # No further "load more" link: this was the last page.
                break
            next_paging = more.group('paging')
            page_num += 1

        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1556
5f6a1245 1557
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "recommended" feed (also reachable via :ytrec)."""
    # Feed selection and resulting playlist title, read by the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1563
5f6a1245 1564
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "watch_later" feed (also reachable via :ytwatchlater)."""
    # Feed selection and resulting playlist title, read by the base class.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # This feed is requested as a personal feed instead of a system feed.
    _PERSONAL_FEED = True

    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1571
5f6a1245 1572
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "history" feed (also reachable via :ythistory)."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling extractors: a plain literal containing
    # "\." is an invalid escape sequence and triggers a warning on newer
    # Python versions. The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # This feed is requested as a personal feed instead of a system feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1579
5f6a1245 1580
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to its underlying playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result for the playlist id found on the favourites page."""
        # The fixed my_favorites address is always fetched, whatever form
        # of `url` was matched.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" value on the page is taken as the playlist id.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1591
1592
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed, built on the playlist extractor."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Walk the subscriptions feed page by page and return it as a playlist."""
        playlist_title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', playlist_title)

        # Extraction works like it does for playlists, except that the
        # video id regex here carries no index.
        video_ids = []
        # On the first pass the full page is both the content and the
        # place where the "load more" link is looked up.
        content_html = first_page
        widget_html = first_page

        page_num = 1
        while True:
            video_ids.extend(orderedSet(
                re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if more_match is None:
                # No continuation link left, so every page has been read.
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': playlist_title,
            'entries': self._ids_to_results(video_ids),
        }
1629
1630
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that carry no video id and report a helpful error.

    Such URLs are what is left when an unquoted ``&`` is consumed by the
    shell; extraction always fails with an explanatory message.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely mistake."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        # expected=True is passed through to ExtractorError unchanged.
        raise ExtractorError(hint, expected=True)