]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[tumblr] Improve downloading notes
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
c5e8d7af 22 compat_urllib_request,
7c61bd36 23 compat_urlparse,
c5e8d7af 24 compat_str,
4bb4a188
PH
25)
26from ..utils import (
c5e8d7af 27 clean_html,
c5e8d7af 28 ExtractorError,
2d30521a 29 float_or_none,
4bb4a188
PH
30 get_element_by_attribute,
31 get_element_by_id,
dd27fd17 32 int_or_none,
4bb4a188 33 orderedSet,
c93d53f5 34 str_to_int,
c5e8d7af
PH
35 unescapeHTML,
36 unified_strdate,
81c2f20b 37 uppercase_escape,
af214c3a 38 ISO3166Utils,
c5e8d7af
PH
39)
40
5f6a1245 41
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one url_result (extractor key 'Youtube') per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as promised by the docstring
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            hidden_fields = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    self._downloader.report_warning(
                        'Failed to get %s - did the page structure change?' % field)
                    # Without the hidden field the TFA form cannot be
                    # submitted; bail out instead of dereferencing None
                    return False
                hidden_fields[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': hidden_fields['secTok'],
                'timeStmp': hidden_fields['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; do nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 193
8377574c 194
360e1ca5 195class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 196 IE_DESC = 'YouTube.com'
cb7dfeea 197 _VALID_URL = r"""(?x)^
c5e8d7af 198 (
edb53e2d 199 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 200 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 201 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 202 (?:www\.)?pwnyoutube\.com/|
f7000f3a 203 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
204 tube\.majestyc\.net/|
205 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
206 (?:.*?\#/)? # handle anchor (#/) redirect urls
207 (?: # the various things that can precede the ID:
ac7553d0 208 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 209 |(?: # or the v= param in all its forms
f7000f3a 210 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
211 (?:\?|\#!?) # the params delimiter ? or # or #!
212 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
213 v=
214 )
f4b05232
JMF
215 ))
216 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 217 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 218 )
c5e8d7af 219 )? # all until now is optional -> you can pass the naked ID
8963d9c2 220 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 221 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
222 (?(1).+)? # if we found the ID, everything can follow
223 $"""
c5e8d7af 224 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
225 _formats = {
226 '5': {'ext': 'flv', 'width': 400, 'height': 240},
227 '6': {'ext': 'flv', 'width': 450, 'height': 270},
228 '13': {'ext': '3gp'},
229 '17': {'ext': '3gp', 'width': 176, 'height': 144},
230 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
231 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
232 '34': {'ext': 'flv', 'width': 640, 'height': 360},
233 '35': {'ext': 'flv', 'width': 854, 'height': 480},
234 '36': {'ext': '3gp', 'width': 320, 'height': 240},
235 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
236 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
237 '43': {'ext': 'webm', 'width': 640, 'height': 360},
238 '44': {'ext': 'webm', 'width': 854, 'height': 480},
239 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
240 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
241 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
242 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 243
1d043b93 244
86fe61c8 245 # 3d videos
43b81eb9
PH
246 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
247 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
248 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
249 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
250 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 253
96fb5605 254 # Apple HTTP Live Streaming
43b81eb9
PH
255 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
256 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
257 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
258 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
259 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
260 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
261 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
262
263 # DASH mp4 video
43b81eb9
PH
264 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
265 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
266 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
267 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
268 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 269 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
270 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
272 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
273 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
274 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 275
f6f1fc92 276 # Dash mp4 audio
62cd676c
PH
277 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
278 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
279 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
280
281 # Dash webm
e75cafe9
A
282 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
283 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 288 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
289 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 296 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 297 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
298 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
299 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 300 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 301 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 302 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
303
304 # Dash webm audio
55db73ef 305 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 306 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 307
0857baad
PH
308 # Dash webm audio with opus inside
309 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
310 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
311 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
312
ce6b9a2d
PH
313 # RTMP (unnamed)
314 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 315 }
836a086c 316
78caa52a 317 IE_NAME = 'youtube'
2eb88d95
PH
318 _TESTS = [
319 {
4bc3a23e
PH
320 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
321 'info_dict': {
322 'id': 'BaW_jenozKc',
323 'ext': 'mp4',
324 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
325 'uploader': 'Philipp Hagemeister',
326 'uploader_id': 'phihag',
327 'upload_date': '20121002',
328 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
329 'categories': ['Science & Technology'],
3e7c1224
PH
330 'like_count': int,
331 'dislike_count': int,
2eb88d95 332 }
0e853ca4 333 },
0e853ca4 334 {
4bc3a23e
PH
335 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
336 'note': 'Test generic use_cipher_signature video (#897)',
337 'info_dict': {
338 'id': 'UxxajLWwzqY',
339 'ext': 'mp4',
340 'upload_date': '20120506',
341 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
342 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
343 'uploader': 'Icona Pop',
344 'uploader_id': 'IconaPop',
2eb88d95 345 }
c108eb73
JMF
346 },
347 {
4bc3a23e
PH
348 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
349 'note': 'Test VEVO video with age protection (#956)',
350 'info_dict': {
351 'id': '07FYdnEawAQ',
352 'ext': 'mp4',
353 'upload_date': '20130703',
354 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
355 'description': 'md5:64249768eec3bc4276236606ea996373',
356 'uploader': 'justintimberlakeVEVO',
357 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
358 }
359 },
fccd3771 360 {
4bc3a23e
PH
361 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
362 'note': 'Embed-only video (#1746)',
363 'info_dict': {
364 'id': 'yZIXLfi8CZQ',
365 'ext': 'mp4',
366 'upload_date': '20120608',
367 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
368 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
369 'uploader': 'SET India',
370 'uploader_id': 'setindia'
fccd3771
PH
371 }
372 },
dd27fd17 373 {
4bc3a23e
PH
374 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
375 'note': '256k DASH audio (format 141) via DASH manifest',
376 'info_dict': {
377 'id': 'a9LDPn-MO4I',
378 'ext': 'm4a',
379 'upload_date': '20121002',
380 'uploader_id': '8KVIDEO',
381 'description': '',
382 'uploader': '8KVIDEO',
383 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 384 },
4bc3a23e
PH
385 'params': {
386 'youtube_include_dash_manifest': True,
387 'format': '141',
4919603f 388 },
dd27fd17 389 },
3489b7d2
JMF
390 # DASH manifest with encrypted signature
391 {
78caa52a
PH
392 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
393 'info_dict': {
394 'id': 'IB3lcPjvWLA',
395 'ext': 'm4a',
b766eb27
JMF
396 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
397 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
398 'uploader': 'AfrojackVEVO',
399 'uploader_id': 'AfrojackVEVO',
400 'upload_date': '20131011',
3489b7d2 401 },
4bc3a23e 402 'params': {
78caa52a
PH
403 'youtube_include_dash_manifest': True,
404 'format': '141',
3489b7d2
JMF
405 },
406 },
aaeb86f6
S
407 # JS player signature function name containing $
408 {
409 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
410 'info_dict': {
411 'id': 'nfWlot6h_JM',
412 'ext': 'm4a',
413 'title': 'Taylor Swift - Shake It Off',
414 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
415 'uploader': 'TaylorSwiftVEVO',
416 'uploader_id': 'TaylorSwiftVEVO',
417 'upload_date': '20140818',
418 },
419 'params': {
420 'youtube_include_dash_manifest': True,
421 'format': '141',
422 },
423 },
aa79ac0c
PH
424 # Controversy video
425 {
426 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
427 'info_dict': {
428 'id': 'T4XJQO3qol8',
429 'ext': 'mp4',
430 'upload_date': '20100909',
431 'uploader': 'The Amazing Atheist',
432 'uploader_id': 'TheAmazingAtheist',
433 'title': 'Burning Everyone\'s Koran',
434 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
435 }
c522adb1
JMF
436 },
437 # Normal age-gate video (No vevo, embed allowed)
438 {
439 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
440 'info_dict': {
441 'id': 'HtVdAasjOgU',
442 'ext': 'mp4',
443 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 444 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
445 'uploader': 'The Witcher',
446 'uploader_id': 'WitcherGame',
447 'upload_date': '20140605',
448 },
449 },
fccae2b9
S
450 # Age-gate video with encrypted signature
451 {
452 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
453 'info_dict': {
454 'id': '6kLq3WMV1nU',
455 'ext': 'mp4',
456 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
457 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
458 'uploader': 'LloydVEVO',
459 'uploader_id': 'LloydVEVO',
460 'upload_date': '20110629',
461 },
462 },
774e208f
PH
463 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
464 {
465 'url': '__2ABJjxzNo',
466 'info_dict': {
467 'id': '__2ABJjxzNo',
468 'ext': 'mp4',
469 'upload_date': '20100430',
470 'uploader_id': 'deadmau5',
471 'description': 'md5:12c56784b8032162bb936a5f76d55360',
472 'uploader': 'deadmau5',
473 'title': 'Deadmau5 - Some Chords (HD)',
474 },
475 'expected_warnings': [
476 'DASH manifest missing',
477 ]
e52a40ab
PH
478 },
479 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
480 {
481 'url': 'lqQg6PlCWgI',
482 'info_dict': {
483 'id': 'lqQg6PlCWgI',
484 'ext': 'mp4',
cbe2bd91
PH
485 'upload_date': '20120731',
486 'uploader_id': 'olympic',
487 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
488 'uploader': 'Olympics',
489 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
490 },
491 'params': {
492 'skip_download': 'requires avconv',
e52a40ab 493 }
cbe2bd91 494 },
6271f1ca
PH
495 # Non-square pixels
496 {
497 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
498 'info_dict': {
499 'id': '_b-2C3KPAM0',
500 'ext': 'mp4',
501 'stretched_ratio': 16 / 9.,
502 'upload_date': '20110310',
503 'uploader_id': 'AllenMeow',
504 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
505 'uploader': '孫艾倫',
506 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
507 },
06b491eb
S
508 },
509 # url_encoded_fmt_stream_map is empty string
510 {
511 'url': 'qEJwOuvDf7I',
512 'info_dict': {
513 'id': 'qEJwOuvDf7I',
514 'ext': 'mp4',
515 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
516 'description': '',
517 'upload_date': '20150404',
518 'uploader_id': 'spbelect',
519 'uploader': 'Наблюдатели Петербурга',
520 },
521 'params': {
522 'skip_download': 'requires avconv',
523 }
524 },
da77d856
S
525 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
526 {
527 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
528 'info_dict': {
529 'id': 'FIl7x6_3R5Y',
530 'ext': 'mp4',
531 'title': 'md5:7b81415841e02ecd4313668cde88737a',
532 'description': 'md5:116377fd2963b81ec4ce64b542173306',
533 'upload_date': '20150625',
534 'uploader_id': 'dorappi2000',
535 'uploader': 'dorappi2000',
536 'formats': 'mincount:33',
537 },
538 }
2eb88d95
PH
539 ]
540
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, keyed by (player_url, signature cache id)
    self._player_cache = dict()
e0df6211 544
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 548
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
552
def report_unavailable_format(self, video_id, format):
    """Print a note that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
556
def report_rtmp_download(self):
    """Print a note that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 560
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of example_sig, joined by dots."""
    part_lengths = [len(chunk) for chunk in example_sig.split('.')]
    return '.'.join(map(compat_str, part_lengths))
60064c53
PH
564
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms an encrypted signature string.

    The player id and type (the file extension, 'js' or 'swf') are parsed
    from player_url. If the 'youtube-sigfuncs' cache already holds an index
    list for this player and signature layout, a function applying that
    list is returned. Otherwise the player is downloaded and parsed, the
    resulting function is probed with a test string to derive the index
    list, the list is stored in the cache, and the parsed function is
    returned.

    Raises ExtractorError if the player cannot be identified from
    player_url or if its type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is used as a cache key; refuse anything with a path component.
    # An explicit raise (rather than assert) keeps the check under python -O.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature function id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would vanish under python -O and the
        # code below would then fail on the unbound name `res`
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string whose i-th character has code point i through the
    # function; the code points of the output are the source indices
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
610
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to func for signatures shaped like example_sig.

    func is probed with a string whose i-th character has code point i; the
    output's code points are turned into an expression of indexing and
    slicing operations on `s`, which is shown via to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            first = '' if start == 0 else str(start)
            stop = end + step
            last = ':' if stop < 0 else (':%d' % stop)
            stride = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (first, last, stride)

        # run_step is None while no run of consecutive indices is open
        run_step = None
        # Quelch pyflakes warnings - run_start will be set when run_step is set
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if run_step is not None:
                if delta != run_step:
                    # The open run ends at prev
                    yield _genslice(run_start, prev, run_step)
                    run_step = None
            elif delta in (-1, 1):
                run_step = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        if run_step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(run_start, cur, run_step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_list = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(gen_sig_code(index_list))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 649
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable applying the signature function found in jscode to a string."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return js_function([s])

    return decipher
658
def _parse_sig_swf(self, file_contents):
    """Return a callable applying SignatureDecipher.decipher from the SWF to a string."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])

    return decipher
665
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The signature function is looked up in (or added to) the per-instance
    player cache. Any exception raised on the way is re-raised as an
    ExtractorError carrying the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 689
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is fetched from the timedtext listing; for each language
    (first track wins) one entry per extension in sbv, vtt, srt is built.
    An empty dict is returned, with a warning, when the listing cannot be
    downloaded or contains no tracks. The webpage argument is not used.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang not in subtitles:
            track_name = track.attrib['name'].encode('utf-8')
            subtitles[lang] = [{
                'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track_name,
                }),
                'ext': ext,
            } for ext in ('sbv', 'vtt', 'srt')]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
721
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping target language code to a list of caption formats.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    An empty dict is returned, with a warning, when the player config is
    missing from the webpage or is not valid JSON, when a required key is
    absent, when the caption list cannot be downloaded, or when the list
    contains no track.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The captured text is not guaranteed to be a complete JSON object;
        # treat unparsable config like a missing one instead of crashing
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']
        caption_kind = original_lang_node.attrib.get('kind', '')

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            sub_formats = []
            for ext in ['sbv', 'vtt', 'srt']:
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_formats.append({
                    'url': caption_url + '&' + params,
                    'ext': ext,
                })
            sub_lang_list[sub_lang] = sub_formats
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
774
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by the second group of _VALID_URL.

    The class pattern is applied in verbose mode.  Raises ExtractorError
    when the URL does not match it.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
782
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with '#' is taken
    as a stream URL; its itag is read from the 'itag/<digits>/' path
    component.  Returns a dict {itag: url}.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    itag_to_url = {}
    for line in manifest.split('\n'):
        # Blank lines and '#' lines carry no stream URL
        if not line or line.startswith('#'):
            continue
        stream_itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[stream_itag] = line
    return itag_to_url
797
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for video_id."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 801
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and return the formats it describes.

    Any encrypted '/s/<signature>' component of dash_manifest_url is
    replaced by its decrypted '/signature/<...>' form before the
    download.  fatal is forwarded to the XML download; when the download
    yields False an empty list is returned.

    Returns a list of format dicts, one per representation id.  A
    representation id that occurs more than once updates the entry that
    was created first.  A missing 'id' attribute raises KeyError.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    if dash_doc is False:
        return []

    formats = []
    # Index of the dicts stored in formats, keyed by format_id
    formats_by_id = {}
    for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                continue
            # mime_type is None when the attribute is absent; report it
            # as unknown instead of failing on None.startswith
            if not mime_type or not mime_type.startswith(('audio/', 'video/')):
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                continue

            format_id = r.attrib['id']
            video_url = url_el.text
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'height': int_or_none(r.attrib.get('height')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }

            existing_format = formats_by_id.get(format_id)
            if existing_format is not None:
                existing_format.update(f)
                continue

            # Start from the static per-itag table and let the manifest
            # values override it
            full_info = self._formats.get(format_id, {}).copy()
            full_info.update(f)
            codecs = r.attrib.get('codecs')
            if codecs:
                # Fill in only the codec the static table left undefined
                if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                    full_info['vcodec'] = codecs
                elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                    full_info['acodec'] = codecs
            formats.append(full_info)
            formats_by_id[format_id] = full_info
    return formats
861
def _real_extract(self, url):
    """Extract the info dict of a single video.

    Downloads the watch page (plus the embed page and get_video_info
    documents where needed), collects the metadata, builds the list of
    formats from the rtmp connection, the encoded stream maps or the HLS
    manifest, and merges in the formats of any DASH manifest found.

    Raises ExtractorError when no usable video info is returned, for
    rental and rtmpe videos, when the uploader name is missing or when
    no format source is present.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters inside the page's JSON
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember every distinct DASH manifest URL that is advertised
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                # Stop at the first variant that yields a token
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed is not None:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the text that was just matched.  mobj still holds
            # the uploader id match at this point (or None), so it must
            # not be used here.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the target given in their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn {itag: url} into format dicts enriched from the static table
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: the player code is needed to decrypt it
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1230
5f6a1245 1231
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, including the auto-generated mixes."""
    IE_DESC = 'YouTube.com playlists'
    # Group 1 captures the id of a playlist URL, group 2 a bare playlist id
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Same key as the other minimum-count entries above
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Return the playlist result of a mix.

        The video ids are read from the watch page of the video the mix
        is based on.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Use the first title element that is present
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Return the playlist result of a regular playlist.

        Raises ExtractorError when the playlist page reports that the
        playlist does not exist, is private, or that the parameters are
        invalid.  The entries are produced lazily, following the
        "load more" links of the page.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch a playlist URL or bare id to the matching extraction.

        A URL that also names a video yields only that video when the
        'noplaylist' parameter is set.  Ids starting with 'RD' or 'UL'
        are handled as mixes, everything else as a regular playlist.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1424
c5e8d7af
PH
1425
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links in page.

        Each id is reported once, in order of first appearance.  When a
        later link to the same video carries a title and the stored one
        is empty, the stored title is filled in.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return the videos of a channel as a playlist.

        Delegates to the channel's uploads playlist when its id can be
        found, otherwise walks the channel's video pages.
        """
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        # The download is non-fatal, so the result is not guaranteed to
        # be page text; searching a falsy result would break the
        # fallback below instead of triggering it
        channel_playlist_id = None
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The uploads playlist id is the channel id with 'UU' in place of 'UC'
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1518
1519
class YoutubeUserIE(YoutubeChannelIE):
    # Channel extractor variant addressed by user name instead of channel id.
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor should handle ``url``.

        The pattern of this extractor is too permissive, so any URL that
        another ``*IE`` class of this module accepts is refused here.
        """
        for name, klass in globals().items():
            if klass is cls or not name.endswith('IE'):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1546
b05654f0 1547
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into every results request; subclasses
    # override this to alter the search (e.g. the ordering).
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are requested one after another until a page yields
        no video links or at least ``n`` videos have been gathered; the
        collected list is then cut down to ``n`` entries.

        Raises ExtractorError when a result page carries the
        ``search-message`` marker.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as the quota is reached (>=, not >): fetching one
            # more page would be wasted work and could even abort the whole
            # search if that page turned out to be the "no results" one.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1591
c9ae7b95 1592
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class; only the keyword, the names and the
    # extra query argument selecting the upload-date ordering differ.
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1598
c9ae7b95
PH
1599
class YoutubeSearchURLIE(InfoExtractor):
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Turn a search results URL into a playlist of the listed items.

        The decoded ``search_query`` parameter becomes the playlist title;
        each ``yt-lockup-title`` heading inside the result list yields one
        ``url`` entry.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Restrict the scan to the result list itself.
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for heading_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1641
1642
class YoutubeShowIE(InfoExtractor):
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist holding one playlist entry per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1677
1678
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        # Derived from the feed name so subclasses need not spell it out.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page and its "load more" continuations are scanned for
        watch links; collection stops when a portion brings no id that
        has not been seen before, or when no further "load more" link is
        present.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            # NOTE: this must be a real list.  On Python 3 ``filter`` returns
            # an iterator, which is truthy even when empty, so the emptiness
            # check below would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1726
1727
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Extract the playlist with the fixed id 'WL', whatever ``url`` is."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1737
5f6a1245 1738
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id and hand it to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is taken from the first 'list=' parameter found in the page.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1749
1750
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'recommended' feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1756
1ed5b5c9 1757
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'subscriptions' feed.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1ed5b5c9 1763
1ed5b5c9 1764
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'history' feed.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' is not a valid
    # escape sequence in a plain string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1770
1771
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Matches watch/attribution URLs that end right after a single
    # non-video parameter, so that a helpful error can be reported.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail, explaining how to quote the URL on the command line."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1815
1816
class YoutubeTruncatedIDIE(InfoExtractor):
    # Matches watch URLs whose video id is 1 to 10 characters long.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail, reporting the incomplete id and the offending URL."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)