]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Make further DASH manifests not fatal after succeeded one
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
4bb4a188 31 orderedSet,
c93d53f5 32 str_to_int,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
af214c3a 36 ISO3166Utils,
c5e8d7af
PH
37)
38
5f6a1245 39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the YouTube PREF cookie so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one url_result() entry (ie 'Youtube') per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as promised by the docstring
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            tfa_tokens = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    self._downloader.report_warning('Failed to get %s - did the page structure change?' % field)
                    # Without the token the TFA form cannot be submitted;
                    # give up instead of dereferencing the missing match.
                    return False
                tfa_tokens[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': tfa_tokens['secTok'],
                'timeStmp': tfa_tokens['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again is treated as rejected credentials
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here: _login() has already warned
        if not self._login():
            return
c5e8d7af 191
8377574c 192
360e1ca5 193class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 194 IE_DESC = 'YouTube.com'
cb7dfeea 195 _VALID_URL = r"""(?x)^
c5e8d7af 196 (
edb53e2d 197 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 198 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 199 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 200 (?:www\.)?pwnyoutube\.com/|
f7000f3a 201 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
202 tube\.majestyc\.net/|
203 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
204 (?:.*?\#/)? # handle anchor (#/) redirect urls
205 (?: # the various things that can precede the ID:
ac7553d0 206 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 207 |(?: # or the v= param in all its forms
f7000f3a 208 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
209 (?:\?|\#!?) # the params delimiter ? or # or #!
210 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
211 v=
212 )
f4b05232
JMF
213 ))
214 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 215 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 216 )
c5e8d7af 217 )? # all until now is optional -> you can pass the naked ID
8963d9c2 218 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 219 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
220 (?(1).+)? # if we found the ID, everything can follow
221 $"""
c5e8d7af 222 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
223 _formats = {
224 '5': {'ext': 'flv', 'width': 400, 'height': 240},
225 '6': {'ext': 'flv', 'width': 450, 'height': 270},
226 '13': {'ext': '3gp'},
227 '17': {'ext': '3gp', 'width': 176, 'height': 144},
228 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
229 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
230 '34': {'ext': 'flv', 'width': 640, 'height': 360},
231 '35': {'ext': 'flv', 'width': 854, 'height': 480},
232 '36': {'ext': '3gp', 'width': 320, 'height': 240},
233 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
234 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
235 '43': {'ext': 'webm', 'width': 640, 'height': 360},
236 '44': {'ext': 'webm', 'width': 854, 'height': 480},
237 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
238 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
239 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
240 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 241
1d043b93 242
86fe61c8 243 # 3d videos
43b81eb9
PH
244 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
245 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
246 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
247 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
248 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
249 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
250 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 251
96fb5605 252 # Apple HTTP Live Streaming
43b81eb9
PH
253 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
254 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
255 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
256 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
257 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
258 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
259 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
260
261 # DASH mp4 video
43b81eb9
PH
262 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
263 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
264 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
265 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
266 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 267 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
268 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
270 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
271 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
272 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 273
f6f1fc92 274 # Dash mp4 audio
62cd676c
PH
275 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
276 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
277 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
278
279 # Dash webm
e75cafe9
A
280 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
281 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
282 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
283 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 286 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
287 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
288 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
289 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 294 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 295 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
296 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
297 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 298 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 299 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 300 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
301
302 # Dash webm audio
55db73ef 303 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 304 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 305
0857baad
PH
306 # Dash webm audio with opus inside
307 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
308 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
309 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
310
ce6b9a2d
PH
311 # RTMP (unnamed)
312 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 313 }
836a086c 314
78caa52a 315 IE_NAME = 'youtube'
2eb88d95
PH
316 _TESTS = [
317 {
4bc3a23e
PH
318 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
319 'info_dict': {
320 'id': 'BaW_jenozKc',
321 'ext': 'mp4',
322 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
323 'uploader': 'Philipp Hagemeister',
324 'uploader_id': 'phihag',
325 'upload_date': '20121002',
326 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
327 'categories': ['Science & Technology'],
3e7c1224
PH
328 'like_count': int,
329 'dislike_count': int,
2eb88d95 330 }
0e853ca4 331 },
0e853ca4 332 {
4bc3a23e
PH
333 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
334 'note': 'Test generic use_cipher_signature video (#897)',
335 'info_dict': {
336 'id': 'UxxajLWwzqY',
337 'ext': 'mp4',
338 'upload_date': '20120506',
339 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
340 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
341 'uploader': 'Icona Pop',
342 'uploader_id': 'IconaPop',
2eb88d95 343 }
c108eb73
JMF
344 },
345 {
4bc3a23e
PH
346 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
347 'note': 'Test VEVO video with age protection (#956)',
348 'info_dict': {
349 'id': '07FYdnEawAQ',
350 'ext': 'mp4',
351 'upload_date': '20130703',
352 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
353 'description': 'md5:64249768eec3bc4276236606ea996373',
354 'uploader': 'justintimberlakeVEVO',
355 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
356 }
357 },
fccd3771 358 {
4bc3a23e
PH
359 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
360 'note': 'Embed-only video (#1746)',
361 'info_dict': {
362 'id': 'yZIXLfi8CZQ',
363 'ext': 'mp4',
364 'upload_date': '20120608',
365 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
366 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
367 'uploader': 'SET India',
368 'uploader_id': 'setindia'
fccd3771
PH
369 }
370 },
dd27fd17 371 {
4bc3a23e
PH
372 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
373 'note': '256k DASH audio (format 141) via DASH manifest',
374 'info_dict': {
375 'id': 'a9LDPn-MO4I',
376 'ext': 'm4a',
377 'upload_date': '20121002',
378 'uploader_id': '8KVIDEO',
379 'description': '',
380 'uploader': '8KVIDEO',
381 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 382 },
4bc3a23e
PH
383 'params': {
384 'youtube_include_dash_manifest': True,
385 'format': '141',
4919603f 386 },
dd27fd17 387 },
3489b7d2
JMF
388 # DASH manifest with encrypted signature
389 {
78caa52a
PH
390 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
391 'info_dict': {
392 'id': 'IB3lcPjvWLA',
393 'ext': 'm4a',
b766eb27
JMF
394 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
395 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
396 'uploader': 'AfrojackVEVO',
397 'uploader_id': 'AfrojackVEVO',
398 'upload_date': '20131011',
3489b7d2 399 },
4bc3a23e 400 'params': {
78caa52a
PH
401 'youtube_include_dash_manifest': True,
402 'format': '141',
3489b7d2
JMF
403 },
404 },
aaeb86f6
S
405 # JS player signature function name containing $
406 {
407 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
408 'info_dict': {
409 'id': 'nfWlot6h_JM',
410 'ext': 'm4a',
411 'title': 'Taylor Swift - Shake It Off',
412 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
413 'uploader': 'TaylorSwiftVEVO',
414 'uploader_id': 'TaylorSwiftVEVO',
415 'upload_date': '20140818',
416 },
417 'params': {
418 'youtube_include_dash_manifest': True,
419 'format': '141',
420 },
421 },
aa79ac0c
PH
422 # Controversy video
423 {
424 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
425 'info_dict': {
426 'id': 'T4XJQO3qol8',
427 'ext': 'mp4',
428 'upload_date': '20100909',
429 'uploader': 'The Amazing Atheist',
430 'uploader_id': 'TheAmazingAtheist',
431 'title': 'Burning Everyone\'s Koran',
432 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
433 }
c522adb1
JMF
434 },
435 # Normal age-gate video (No vevo, embed allowed)
436 {
437 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
438 'info_dict': {
439 'id': 'HtVdAasjOgU',
440 'ext': 'mp4',
441 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 442 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
443 'uploader': 'The Witcher',
444 'uploader_id': 'WitcherGame',
445 'upload_date': '20140605',
446 },
447 },
fccae2b9
S
448 # Age-gate video with encrypted signature
449 {
450 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
451 'info_dict': {
452 'id': '6kLq3WMV1nU',
453 'ext': 'mp4',
454 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
455 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
456 'uploader': 'LloydVEVO',
457 'uploader_id': 'LloydVEVO',
458 'upload_date': '20110629',
459 },
460 },
774e208f
PH
461 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
462 {
463 'url': '__2ABJjxzNo',
464 'info_dict': {
465 'id': '__2ABJjxzNo',
466 'ext': 'mp4',
467 'upload_date': '20100430',
468 'uploader_id': 'deadmau5',
469 'description': 'md5:12c56784b8032162bb936a5f76d55360',
470 'uploader': 'deadmau5',
471 'title': 'Deadmau5 - Some Chords (HD)',
472 },
473 'expected_warnings': [
474 'DASH manifest missing',
475 ]
e52a40ab
PH
476 },
477 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
478 {
479 'url': 'lqQg6PlCWgI',
480 'info_dict': {
481 'id': 'lqQg6PlCWgI',
482 'ext': 'mp4',
cbe2bd91
PH
483 'upload_date': '20120731',
484 'uploader_id': 'olympic',
485 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
486 'uploader': 'Olympics',
487 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
488 },
489 'params': {
490 'skip_download': 'requires avconv',
e52a40ab 491 }
cbe2bd91 492 },
6271f1ca
PH
493 # Non-square pixels
494 {
495 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
496 'info_dict': {
497 'id': '_b-2C3KPAM0',
498 'ext': 'mp4',
499 'stretched_ratio': 16 / 9.,
500 'upload_date': '20110310',
501 'uploader_id': 'AllenMeow',
502 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
503 'uploader': '孫艾倫',
504 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
505 },
06b491eb
S
506 },
507 # url_encoded_fmt_stream_map is empty string
508 {
509 'url': 'qEJwOuvDf7I',
510 'info_dict': {
511 'id': 'qEJwOuvDf7I',
512 'ext': 'mp4',
513 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
514 'description': '',
515 'upload_date': '20150404',
516 'uploader_id': 'spbelect',
517 'uploader': 'Наблюдатели Петербурга',
518 },
519 'params': {
520 'skip_download': 'requires avconv',
521 }
522 },
da77d856
S
523 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
524 {
525 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
526 'info_dict': {
527 'id': 'FIl7x6_3R5Y',
528 'ext': 'mp4',
529 'title': 'md5:7b81415841e02ecd4313668cde88737a',
530 'description': 'md5:116377fd2963b81ec4ce64b542173306',
531 'upload_date': '20150625',
532 'uploader_id': 'dorappi2000',
533 'uploader': 'dorappi2000',
534 'formats': 'mincount:33',
535 },
536 }
2eb88d95
PH
537 ]
538
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted, filled by _decrypt_signature
    self._player_cache = dict()
e0df6211 542
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage of video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 546
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that information for video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
550
def report_unavailable_format(self, video_id, format):
    """Print a note that the requested format is not available for video_id."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
554
def report_rtmp_download(self):
    """Print a note that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 558
60064c53
PH
559 def _signature_cache_id(self, example_sig):
560 """ Return a string representation of a signature """
78caa52a 561 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
562
563 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 564 id_m = re.match(
60620368 565 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 566 player_url)
c081b35c
PH
567 if not id_m:
568 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
569 player_type = id_m.group('ext')
570 player_id = id_m.group('id')
571
c4417ddb 572 # Read from filesystem cache
60064c53
PH
573 func_id = '%s_%s_%s' % (
574 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 575 assert os.path.basename(func_id) == func_id
a0e07d31 576
69ea8ca4 577 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 578 if cache_spec is not None:
78caa52a 579 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 580
6d1a55a5
PH
581 download_note = (
582 'Downloading player %s' % player_url
583 if self._downloader.params.get('verbose') else
584 'Downloading %s player %s' % (player_type, player_id)
585 )
e0df6211
PH
586 if player_type == 'js':
587 code = self._download_webpage(
588 player_url, video_id,
6d1a55a5 589 note=download_note,
69ea8ca4 590 errnote='Download of %s failed' % player_url)
83799698 591 res = self._parse_sig_js(code)
c4417ddb 592 elif player_type == 'swf':
e0df6211
PH
593 urlh = self._request_webpage(
594 player_url, video_id,
6d1a55a5 595 note=download_note,
69ea8ca4 596 errnote='Download of %s failed' % player_url)
e0df6211 597 code = urlh.read()
83799698 598 res = self._parse_sig_swf(code)
e0df6211
PH
599 else:
600 assert False, 'Invalid player type %r' % player_type
601
785521bf
PH
602 test_string = ''.join(map(compat_chr, range(len(example_sig))))
603 cache_res = res(test_string)
604 cache_spec = [ord(c) for c in cache_res]
83799698 605
69ea8ca4 606 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
607 return res
608
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is run on a probe string as long as example_sig; the permutation
    it produces is rendered as a sum of index and slice expressions on s.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            if start == 0:
                starts = ''
            else:
                starts = str(start)
            stop = end + step
            if stop >= 0:
                ends = ':%d' % stop
            else:
                ends = ':'
            if step == 1:
                steps = ''
            else:
                steps = ':%d' % step
            return 's[%s%s%s]' % (starts, ends, steps)

        run_step = None
        # Quelch pyflakes warnings - run_start will be set when run_step is set
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if run_step is not None:
                # Inside a run: either it continues or it ends at prev
                if delta != run_step:
                    yield _genslice(run_start, prev, run_step)
                    run_step = None
                continue
            if delta in (-1, 1):
                # A new ascending or descending run starts at prev
                run_step = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        if run_step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(run_start, cur, run_step)

    # Each probe character encodes its own position in the string
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(gen_sig_code(permutation))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 647
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a one-argument function wrapping the signature function found in jscode."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    decipher = JSInterpreter(jscode).extract_function(sig_func_name)

    def _apply(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return _apply
656
def _parse_sig_swf(self, file_contents):
    """Return a one-argument function wrapping SignatureDecipher.decipher from the SWF."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return _apply
663
83799698 664 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 665 """Turn the encrypted s field into a working signature"""
6b37f0be 666
c8bf86d5 667 if player_url is None:
69ea8ca4 668 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 669
69ea8ca4 670 if player_url.startswith('//'):
78caa52a 671 player_url = 'https:' + player_url
c8bf86d5 672 try:
62af3a0e 673 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
674 if player_id not in self._player_cache:
675 func = self._extract_signature_function(
60064c53 676 video_id, player_url, s
c8bf86d5
PH
677 )
678 self._player_cache[player_id] = func
679 func = self._player_cache[player_id]
680 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 681 self._print_sig_code(func, s)
c8bf86d5
PH
682 return func(s)
683 except Exception as e:
684 tb = traceback.format_exc()
685 raise ExtractorError(
78caa52a 686 'Signature extraction failed: ' + tb, cause=e)
e0df6211 687
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext listing URL; for every
    language (first track wins) one entry per extension sbv, vtt and srt
    is built. An empty dict is returned, after a warning, when the list
    cannot be downloaded or contains no tracks. webpage is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Keep only the first track of each language
            continue
        formats = []
        for ext in ('sbv', 'vtt', 'srt'):
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            })
            formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        subtitles[lang] = formats
    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
719
360e1ca5 720 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
721 """We need the webpage for getting the captions url, pass it as an
722 argument to speed up the process."""
69ea8ca4 723 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 724 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 725 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
726 if mobj is None:
727 self._downloader.report_warning(err_msg)
728 return {}
729 player_config = json.loads(mobj.group(1))
730 try:
0792d563
PH
731 args = player_config['args']
732 caption_url = args['ttsurl']
733 timestamp = args['timestamp']
055e6f36
JMF
734 # We get the available subtitles
735 list_params = compat_urllib_parse.urlencode({
736 'type': 'list',
737 'tlangs': 1,
738 'asrs': 1,
de7f3446 739 })
055e6f36 740 list_url = caption_url + '&' + list_params
e26f8712 741 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 742 original_lang_node = caption_list.find('track')
7d900ef1 743 if original_lang_node is None:
69ea8ca4 744 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
745 return {}
746 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 747 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
748
749 sub_lang_list = {}
750 for lang_node in caption_list.findall('target'):
751 sub_lang = lang_node.attrib['lang_code']
360e1ca5
JMF
752 sub_formats = []
753 for ext in ['sbv', 'vtt', 'srt']:
754 params = compat_urllib_parse.urlencode({
755 'lang': original_lang,
756 'tlang': sub_lang,
757 'fmt': ext,
758 'ts': timestamp,
759 'kind': caption_kind,
760 })
761 sub_formats.append({
762 'url': caption_url + '&' + params,
763 'ext': ext,
764 })
765 sub_lang_list[sub_lang] = sub_formats
055e6f36 766 return sub_lang_list
de7f3446
JMF
767 # An extractor error can be raise by the download process if there are
768 # no automatic captions but there are subtitles
769 except (KeyError, ExtractorError):
770 self._downloader.report_warning(err_msg)
771 return {}
772
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by the second group of ``_VALID_URL``.

    The pattern is matched in verbose mode against the start of ``url``.
    Raises ExtractorError when the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
780
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its format URL.

    Every non-empty manifest line that is not a ``#`` directive is treated
    as a format URL; its itag is taken from the ``itag/<digits>/`` path
    component. Returns a dict ``{itag: url}``.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    itag_to_url = {}
    for line in manifest.split('\n'):
        # Skip blank lines and m3u8 tags/comments
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[itag] = line
    return itag_to_url
795
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document of a video."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 799
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and return the formats it describes.

    Any ``/s/<encrypted signature>`` component of the manifest URL is
    replaced by ``/signature/<decrypted signature>`` before downloading.

    ``fatal`` is forwarded to the download; when the download returns
    ``False`` an empty list is returned.

    Returns a list of format dicts. Representations sharing a format id are
    merged into a single entry, and static data from ``self._formats`` is
    used as the base of each new entry.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    if dash_doc is False:
        return []

    formats = []
    # Index of the entries of ``formats`` by format id, to merge duplicates
    formats_by_id = {}
    for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
        for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            # mimeType may be declared on the Representation itself instead
            # of on the enclosing AdaptationSet
            mime_type = r.attrib.get('mimeType') or a.attrib.get('mimeType')
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            elif mime_type and mime_type.startswith(('audio/', 'video/')):
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                existing_format = formats_by_id.get(format_id)
                if existing_format is None:
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    codecs = r.attrib.get('codecs')
                    if codecs:
                        # Fill in only the codec that the static format
                        # table leaves unspecified
                        if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                            full_info['vcodec'] = codecs
                        elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                            full_info['acodec'] = codecs
                    formats.append(full_info)
                    formats_by_id[format_id] = full_info
                else:
                    existing_format.update(f)
            else:
                # Also reached when no mimeType is declared at all
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
859
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    The watch page is downloaded first; video info is then taken from the
    embedded player config and/or the get_video_info endpoint (the embed
    page is used for age-gated videos). Formats come from an RTMP
    connection, the encoded stream maps or an HLS manifest, and are
    complemented with the formats of the DASH manifests unless the
    ``youtube_include_dash_manifest`` parameter is disabled.

    Returns the info dict of the video.

    Raises ExtractorError when the video info has no token, for "rental"
    videos, when the uploader name is missing, for rtmpe streams and when
    no format information is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # DASH manifest URLs collected from every video info source, without duplicates
    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed is not None:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the date that was just extracted. ``mobj`` at this
            # point is the (possibly None) uploader-id match, not the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full URL kept in their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: the player code is needed to decrypt it
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1228
5f6a1245 1229
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes."""
    IE_DESC = 'YouTube.com playlists'
    # Group 1 is the playlist id found in a playlist URL, group 2 a bare
    # playlist id given without any URL
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', which left the count unchecked
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before any extraction."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Return the playlist result of a mix.

        The watch page of the seed video is downloaded and the ids of the
        videos linked with this playlist id are collected from it.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in turn
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Return the playlist result of a regular playlist.

        The entries are produced lazily by following the "load more" links
        of the playlist page.

        Raises ExtractorError when the page reports that the playlist does
        not exist, is private, or that the parameters are invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Extract a playlist, or only the video of the URL with --no-playlist.

        Raises ExtractorError when the URL does not match ``_VALID_URL``.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1422
c5e8d7af
PH
1423
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return the (video id, title) pairs of the watch links in ``page``.

        Each id is reported once, in order of first appearance. When a link
        without title is followed by a link to the same video with a title,
        that title is used.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                # Already seen: only fill in a title that was still missing
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Extract the videos of a channel.

        When the uploads playlist of the channel can be determined, the
        extraction is delegated to the playlist extractor; otherwise the
        channel pages are walked through their "load more" links.
        """
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            # The non-fatal download failed: there is no page to search,
            # go straight to the page by page extraction
            channel_playlist_id = None
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The uploads playlist id is the channel id with 'UC' replaced by 'UU'
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1516
1517
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user (URL or "ytuser:" keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # The regex of this extractor is too permissive: it would also match
        # URLs that belong to the other extractors defined in this module.
        # Decline the URL as soon as any other '...IE' class accepts it.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1544
b05654f0 1545
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for "ytsearch" keyword queries against YouTube's result pages."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into every result-page request
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until a page yields
        no new video or at least *n* videos have been collected.  The
        collected list is cut down to *n* entries and returned as a
        playlist titled *query*.

        Raises ExtractorError when a result page contains a
        'search-message' element.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop once n videos are available: '>=' rather than '>' avoids
            # downloading one more page when exactly n were collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1589
c9ae7b95 1590
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same as YoutubeSearchIE, with an extra sort parameter in the query
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1596
c9ae7b95
PH
1597
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result URLs (``/results?search_query=...``)."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist, titled with the decoded query, of the results linked from the page."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Restrict the scan to the <ol class="item-section"> result list
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result carries its title and link inside an <h3> heading
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1639
1640
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube show pages; yields one playlist entry per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists linked from the show page."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1675
1676
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed page by page and return them as a playlist.

        The *url* argument is not used; the feed address is built from
        _FEED_NAME.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            # This must be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness test below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1724
1725
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    # Thin wrapper that extracts the fixed 'WL' playlist
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # override PlaylistIE tests
    _TESTS = []

    def _real_extract(self, url):
        # The matched URL is irrelevant: the playlist id is always 'WL'
        watch_later = self._extract_playlist('WL')
        return watch_later
f459d170 1735
5f6a1245 1736
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Resolves the favourites page to its playlist id and delegates to
    # the 'YoutubePlaylist' extractor
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first 'list=' parameter on the page names the favourites playlist
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1747
1748
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/recommended
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1754
1ed5b5c9 1755
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/subscriptions
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1ed5b5c9 1761
1ed5b5c9 1762
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/history
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling extractors: '\.' is not a valid escape
    # sequence in a normal string literal (the resulting pattern is unchanged).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1768
1769
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and reports a hint instead of extracting."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extracts anything: always raises an expected error that
        # explains how to quote the URL.
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1813
1814
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id has 1 to 10 characters and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extracts anything: always raises an expected error naming
        # the incomplete id and the URL it came from.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)