]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[options] Clarify --youtube-skip-dash-manifest
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
4bb4a188 31 orderedSet,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (requests hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one url_result entry (extractor key 'Youtube') per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # The TFA form cannot be filled in without this field; give up
                # instead of dereferencing the missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; no-op without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 189
8377574c 190
360e1ca5 191class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 192 IE_DESC = 'YouTube.com'
cb7dfeea 193 _VALID_URL = r"""(?x)^
c5e8d7af 194 (
edb53e2d 195 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 196 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 197 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 198 (?:www\.)?pwnyoutube\.com/|
f7000f3a 199 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
200 tube\.majestyc\.net/|
201 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
202 (?:.*?\#/)? # handle anchor (#/) redirect urls
203 (?: # the various things that can precede the ID:
ac7553d0 204 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 205 |(?: # or the v= param in all its forms
f7000f3a 206 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
207 (?:\?|\#!?) # the params delimiter ? or # or #!
208 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 v=
210 )
f4b05232
JMF
211 ))
212 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 213 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 214 )
c5e8d7af 215 )? # all until now is optional -> you can pass the naked ID
8963d9c2 216 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 217 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
218 (?(1).+)? # if we found the ID, everything can follow
219 $"""
c5e8d7af 220 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
221 _formats = {
222 '5': {'ext': 'flv', 'width': 400, 'height': 240},
223 '6': {'ext': 'flv', 'width': 450, 'height': 270},
224 '13': {'ext': '3gp'},
225 '17': {'ext': '3gp', 'width': 176, 'height': 144},
226 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
227 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
228 '34': {'ext': 'flv', 'width': 640, 'height': 360},
229 '35': {'ext': 'flv', 'width': 854, 'height': 480},
230 '36': {'ext': '3gp', 'width': 320, 'height': 240},
231 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
232 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
233 '43': {'ext': 'webm', 'width': 640, 'height': 360},
234 '44': {'ext': 'webm', 'width': 854, 'height': 480},
235 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
236 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
237 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
238 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 239
1d043b93 240
86fe61c8 241 # 3d videos
43b81eb9
PH
242 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
243 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
244 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
245 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
246 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
247 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
248 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 249
96fb5605 250 # Apple HTTP Live Streaming
43b81eb9
PH
251 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
252 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
253 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
254 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
255 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
256 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
257 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
258
259 # DASH mp4 video
43b81eb9
PH
260 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
262 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
263 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
264 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 265 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
266 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
267 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
268 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
269 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
270 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 271
f6f1fc92 272 # Dash mp4 audio
62cd676c
PH
273 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
274 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
275 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
276
277 # Dash webm
e75cafe9
A
278 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
279 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
280 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
281 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
282 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
283 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 284 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
285 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
287 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
288 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
289 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 292 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 293 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
294 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
295 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 296 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 297 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 298 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
299
300 # Dash webm audio
55db73ef 301 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 302 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 303
0857baad
PH
304 # Dash webm audio with opus inside
305 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
306 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
307 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
308
ce6b9a2d
PH
309 # RTMP (unnamed)
310 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 311 }
836a086c 312
78caa52a 313 IE_NAME = 'youtube'
2eb88d95
PH
314 _TESTS = [
315 {
4bc3a23e
PH
316 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
317 'info_dict': {
318 'id': 'BaW_jenozKc',
319 'ext': 'mp4',
320 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
321 'uploader': 'Philipp Hagemeister',
322 'uploader_id': 'phihag',
323 'upload_date': '20121002',
324 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
325 'categories': ['Science & Technology'],
3e7c1224
PH
326 'like_count': int,
327 'dislike_count': int,
2eb88d95 328 }
0e853ca4 329 },
0e853ca4 330 {
4bc3a23e
PH
331 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
332 'note': 'Test generic use_cipher_signature video (#897)',
333 'info_dict': {
334 'id': 'UxxajLWwzqY',
335 'ext': 'mp4',
336 'upload_date': '20120506',
337 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
338 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
339 'uploader': 'Icona Pop',
340 'uploader_id': 'IconaPop',
2eb88d95 341 }
c108eb73
JMF
342 },
343 {
4bc3a23e
PH
344 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
345 'note': 'Test VEVO video with age protection (#956)',
346 'info_dict': {
347 'id': '07FYdnEawAQ',
348 'ext': 'mp4',
349 'upload_date': '20130703',
350 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
351 'description': 'md5:64249768eec3bc4276236606ea996373',
352 'uploader': 'justintimberlakeVEVO',
353 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
354 }
355 },
fccd3771 356 {
4bc3a23e
PH
357 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
358 'note': 'Embed-only video (#1746)',
359 'info_dict': {
360 'id': 'yZIXLfi8CZQ',
361 'ext': 'mp4',
362 'upload_date': '20120608',
363 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
364 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
365 'uploader': 'SET India',
366 'uploader_id': 'setindia'
fccd3771
PH
367 }
368 },
dd27fd17 369 {
4bc3a23e
PH
370 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
371 'note': '256k DASH audio (format 141) via DASH manifest',
372 'info_dict': {
373 'id': 'a9LDPn-MO4I',
374 'ext': 'm4a',
375 'upload_date': '20121002',
376 'uploader_id': '8KVIDEO',
377 'description': '',
378 'uploader': '8KVIDEO',
379 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 380 },
4bc3a23e
PH
381 'params': {
382 'youtube_include_dash_manifest': True,
383 'format': '141',
4919603f 384 },
dd27fd17 385 },
3489b7d2
JMF
386 # DASH manifest with encrypted signature
387 {
78caa52a
PH
388 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
389 'info_dict': {
390 'id': 'IB3lcPjvWLA',
391 'ext': 'm4a',
b766eb27
JMF
392 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
393 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
394 'uploader': 'AfrojackVEVO',
395 'uploader_id': 'AfrojackVEVO',
396 'upload_date': '20131011',
3489b7d2 397 },
4bc3a23e 398 'params': {
78caa52a
PH
399 'youtube_include_dash_manifest': True,
400 'format': '141',
3489b7d2
JMF
401 },
402 },
aaeb86f6
S
403 # JS player signature function name containing $
404 {
405 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
406 'info_dict': {
407 'id': 'nfWlot6h_JM',
408 'ext': 'm4a',
409 'title': 'Taylor Swift - Shake It Off',
410 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
411 'uploader': 'TaylorSwiftVEVO',
412 'uploader_id': 'TaylorSwiftVEVO',
413 'upload_date': '20140818',
414 },
415 'params': {
416 'youtube_include_dash_manifest': True,
417 'format': '141',
418 },
419 },
aa79ac0c
PH
420 # Controversy video
421 {
422 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
423 'info_dict': {
424 'id': 'T4XJQO3qol8',
425 'ext': 'mp4',
426 'upload_date': '20100909',
427 'uploader': 'The Amazing Atheist',
428 'uploader_id': 'TheAmazingAtheist',
429 'title': 'Burning Everyone\'s Koran',
430 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
431 }
c522adb1
JMF
432 },
433 # Normal age-gate video (No vevo, embed allowed)
434 {
435 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
436 'info_dict': {
437 'id': 'HtVdAasjOgU',
438 'ext': 'mp4',
439 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 440 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
441 'uploader': 'The Witcher',
442 'uploader_id': 'WitcherGame',
443 'upload_date': '20140605',
444 },
445 },
fccae2b9
S
446 # Age-gate video with encrypted signature
447 {
448 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
449 'info_dict': {
450 'id': '6kLq3WMV1nU',
451 'ext': 'mp4',
452 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
453 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
454 'uploader': 'LloydVEVO',
455 'uploader_id': 'LloydVEVO',
456 'upload_date': '20110629',
457 },
458 },
774e208f
PH
459 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
460 {
461 'url': '__2ABJjxzNo',
462 'info_dict': {
463 'id': '__2ABJjxzNo',
464 'ext': 'mp4',
465 'upload_date': '20100430',
466 'uploader_id': 'deadmau5',
467 'description': 'md5:12c56784b8032162bb936a5f76d55360',
468 'uploader': 'deadmau5',
469 'title': 'Deadmau5 - Some Chords (HD)',
470 },
471 'expected_warnings': [
472 'DASH manifest missing',
473 ]
e52a40ab
PH
474 },
475 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
476 {
477 'url': 'lqQg6PlCWgI',
478 'info_dict': {
479 'id': 'lqQg6PlCWgI',
480 'ext': 'mp4',
cbe2bd91
PH
481 'upload_date': '20120731',
482 'uploader_id': 'olympic',
483 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
484 'uploader': 'Olympics',
485 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
486 },
487 'params': {
488 'skip_download': 'requires avconv',
e52a40ab 489 }
cbe2bd91 490 },
6271f1ca
PH
491 # Non-square pixels
492 {
493 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
494 'info_dict': {
495 'id': '_b-2C3KPAM0',
496 'ext': 'mp4',
497 'stretched_ratio': 16 / 9.,
498 'upload_date': '20110310',
499 'uploader_id': 'AllenMeow',
500 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
501 'uploader': '孫艾倫',
502 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
503 },
06b491eb
S
504 },
505 # url_encoded_fmt_stream_map is empty string
506 {
507 'url': 'qEJwOuvDf7I',
508 'info_dict': {
509 'id': 'qEJwOuvDf7I',
510 'ext': 'mp4',
511 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
512 'description': '',
513 'upload_date': '20150404',
514 'uploader_id': 'spbelect',
515 'uploader': 'Наблюдатели Петербурга',
516 },
517 'params': {
518 'skip_download': 'requires avconv',
519 }
520 },
2eb88d95
PH
521 ]
522
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and start with an
    empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature
    self._player_cache = {}
e0df6211 526
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 530
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
534
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
538
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 542
60064c53
PH
543 def _signature_cache_id(self, example_sig):
544 """ Return a string representation of a signature """
78caa52a 545 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
546
547 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 548 id_m = re.match(
60620368 549 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 550 player_url)
c081b35c
PH
551 if not id_m:
552 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
553 player_type = id_m.group('ext')
554 player_id = id_m.group('id')
555
c4417ddb 556 # Read from filesystem cache
60064c53
PH
557 func_id = '%s_%s_%s' % (
558 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 559 assert os.path.basename(func_id) == func_id
a0e07d31 560
69ea8ca4 561 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 562 if cache_spec is not None:
78caa52a 563 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 564
6d1a55a5
PH
565 download_note = (
566 'Downloading player %s' % player_url
567 if self._downloader.params.get('verbose') else
568 'Downloading %s player %s' % (player_type, player_id)
569 )
e0df6211
PH
570 if player_type == 'js':
571 code = self._download_webpage(
572 player_url, video_id,
6d1a55a5 573 note=download_note,
69ea8ca4 574 errnote='Download of %s failed' % player_url)
83799698 575 res = self._parse_sig_js(code)
c4417ddb 576 elif player_type == 'swf':
e0df6211
PH
577 urlh = self._request_webpage(
578 player_url, video_id,
6d1a55a5 579 note=download_note,
69ea8ca4 580 errnote='Download of %s failed' % player_url)
e0df6211 581 code = urlh.read()
83799698 582 res = self._parse_sig_swf(code)
e0df6211
PH
583 else:
584 assert False, 'Invalid player type %r' % player_type
585
785521bf
PH
586 test_string = ''.join(map(compat_chr, range(len(example_sig))))
587 cache_res = res(test_string)
588 cache_spec = [ord(c) for c in cache_res]
83799698 589
69ea8ca4 590 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
591 return res
592
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is applied to a string made of the characters 0..len(example_sig)-1;
    the resulting index sequence is rendered as a sum of indexing and slicing
    expressions on s, guarded by a check on the signature part lengths.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions covering idxs, merging +/-1 runs into slices."""
        def _genslice(first, last, stride):
            start_txt = '' if first == 0 else str(first)
            stop = last + stride
            stop_txt = (':%d' % stop) if stop >= 0 else ':'
            stride_txt = '' if stride == 1 else (':%d' % stride)
            return 's[%s%s%s]' % (start_txt, stop_txt, stride_txt)

        run_step = None
        # Quelch pyflakes warnings - run_start will be set when run_step is set
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if run_step is not None:
                if delta == run_step:
                    # Still inside the current run
                    continue
                # Run ended at prev
                yield _genslice(run_start, prev, run_step)
                run_step = None
                continue
            if delta in [-1, 1]:
                # A new run begins at prev
                run_step = delta
                run_start = prev
                continue
            yield 's[%d]' % prev
        # Emit whatever is still pending for the last index
        if run_step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(run_start, cur, run_step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_spec = [ord(c) for c in permuted]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 631
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a one-argument callable wrapping the signature function
    found in the JS player code."""
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
640
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable wrapping the 'decipher' function of
    the 'SignatureDecipher' class found in the SWF player."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
647
83799698 648 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 649 """Turn the encrypted s field into a working signature"""
6b37f0be 650
c8bf86d5 651 if player_url is None:
69ea8ca4 652 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 653
69ea8ca4 654 if player_url.startswith('//'):
78caa52a 655 player_url = 'https:' + player_url
c8bf86d5 656 try:
62af3a0e 657 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
658 if player_id not in self._player_cache:
659 func = self._extract_signature_function(
60064c53 660 video_id, player_url, s
c8bf86d5
PH
661 )
662 self._player_cache[player_id] = func
663 func = self._player_cache[player_id]
664 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 665 self._print_sig_code(func, s)
c8bf86d5
PH
666 return func(s)
667 except Exception as e:
668 tb = traceback.format_exc()
669 raise ExtractorError(
78caa52a 670 'Signature extraction failed: ' + tb, cause=e)
e0df6211 671
360e1ca5 672 def _get_subtitles(self, video_id, webpage):
de7f3446 673 try:
60e47a26 674 subs_doc = self._download_xml(
38c2e5b8 675 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
676 video_id, note=False)
677 except ExtractorError as err:
69ea8ca4 678 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 679 return {}
de7f3446
JMF
680
681 sub_lang_list = {}
60e47a26
JMF
682 for track in subs_doc.findall('track'):
683 lang = track.attrib['lang_code']
7e660ac1
LD
684 if lang in sub_lang_list:
685 continue
360e1ca5
JMF
686 sub_formats = []
687 for ext in ['sbv', 'vtt', 'srt']:
688 params = compat_urllib_parse.urlencode({
689 'lang': lang,
690 'v': video_id,
691 'fmt': ext,
692 'name': track.attrib['name'].encode('utf-8'),
693 })
694 sub_formats.append({
695 'url': 'https://www.youtube.com/api/timedtext?' + params,
696 'ext': ext,
697 })
698 sub_lang_list[lang] = sub_formats
de7f3446 699 if not sub_lang_list:
69ea8ca4 700 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
701 return {}
702 return sub_lang_list
703
def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic (machine generated) captions of a video.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The player configuration embedded in ``webpage`` provides the caption
    base URL (``ttsurl``) and a timestamp.  The caption listing is then
    downloaded, and one URL per target language and per format (sbv, vtt,
    srt) is built from the first listed track.

    Returns a dict mapping a language code to a list of
    ``{'url': ..., 'ext': ...}`` dicts, or an empty dict (after a warning)
    when the captions cannot be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    # Same pattern as the one used by _real_extract: the dot is escaped and
    # the whitespace around '=' is optional.
    mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match may cut the JSON short or the page may embed
        # something that is not valid JSON; treat it like a missing config.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']
        caption_kind = original_lang_node.attrib.get('kind', '')

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            sub_formats = []
            for ext in ['sbv', 'vtt', 'srt']:
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_formats.append({
                    'url': caption_url + '&' + params,
                    'ext': ext,
                })
            sub_lang_list[sub_lang] = sub_formats
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
756
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id contained in ``url``.

    The id is the second capture group of ``cls._VALID_URL``, which is
    matched in verbose mode.  Raises ExtractorError when the URL does not
    match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
764
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with '#' is taken
    as a stream URL; its itag is read from the ``itag/<digits>/`` part of
    that URL.  Returns a dict ``{itag: url}``.
    """
    playlist = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    itag_to_url = {}
    for line in playlist.split('\n'):
        # Skip blank lines and m3u8 tags/comments
        if not line or line.startswith('#'):
            continue
        stream_itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[stream_itag] = line
    return itag_to_url
779
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document of a video."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 783
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    Any encrypted signature in the manifest URL (a ``/s/<sig>`` path
    component) is decrypted and replaced by ``/signature/<sig>`` before
    the download.

    Each Representation that has a BaseURL and an audio/* or video/* MIME
    type yields one format dict.  The static data from ``self._formats``
    for that format id is used as a base and overridden by the values read
    from the manifest.  If a format id occurs several times, the later
    occurrences update the first entry.  text/vtt representations are
    skipped; any other (or missing) MIME type only produces a warning.

    Returns the list of format dicts.  A KeyError is raised when a
    Representation has no ``id`` attribute.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    ns = '{urn:mpeg:DASH:schema:MPD:2011}'
    formats = []
    # format_id -> dict stored in ``formats``, to merge duplicates without
    # rescanning the list
    formats_by_id = {}
    for a in dash_doc.findall('.//%sAdaptationSet' % ns):
        set_mime_type = a.attrib.get('mimeType')
        for r in a.findall('%sRepresentation' % ns):
            url_el = r.find('%sBaseURL' % ns)
            if url_el is None:
                continue
            # mimeType may be given on the Representation instead of the
            # AdaptationSet; the more specific one wins.
            mime_type = r.attrib.get('mimeType', set_mime_type)
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                continue
            if mime_type is None or not mime_type.startswith(('audio/', 'video/')):
                # A missing MIME type is reported like an unknown one
                # instead of failing with an AttributeError.
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'height': int_or_none(r.attrib.get('height')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            existing_format = formats_by_id.get(format_id)
            if existing_format is None:
                full_info = self._formats.get(format_id, {}).copy()
                full_info.update(f)
                formats.append(full_info)
                formats_by_id[format_id] = full_info
            else:
                existing_format.update(f)
    return formats
833
def _real_extract(self, url):
    """Extract the metadata and the downloadable formats of one video.

    The watch page is downloaded first.  The video info is then taken
    from the embed page plus ``get_video_info`` (age-gated videos), or
    from the player config in the watch page, optionally complemented by
    ``get_video_info`` requests.  Formats come from an rtmp connection,
    the encoded stream maps (decrypting signatures where needed) or an HLS
    manifest, and are completed with the formats of the DASH manifests
    unless ``youtube_include_dash_manifest`` is disabled.

    Returns the info dict of the video.  Raises ExtractorError when the
    video info has no token, the video is a rental, the uploader name is
    missing, rtmpe is required or no format source is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # DASH manifest URLs collected from every video info source, without
    # duplicates and in discovery order
    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # An earlier response without a token (e.g. an error
                    # reply) must not hide this successful one.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    # The link may be served with either scheme
    mobj = re.search(r'<link itemprop="url" href="https?://www\.youtube\.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full URL kept in their
        # title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # The search above is non-fatal for age-gated videos, so
                # the JSON string may be missing; fall through to the
                # player config lookup below instead of failing in
                # json.loads.
                player_url = json.loads(jsplayer_url_json) if jsplayer_url_json else None
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1189
5f6a1245 1190
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes.

    Accepts playlist URLs as well as bare playlist ids.  Playlists whose
    id starts with 'RD' or 'UL' are extracted from the watch page of their
    seed video; every other playlist is read from its playlist page,
    following the "load more" links.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', which matches none of the
        # count keys used by the other tests
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a mix from the watch page of its seed video."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element uses a different class depending on the page
        # layout; take the first one that is present
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist from its playlist page.

        Raises ExtractorError when the page reports that the playlist does
        not exist, is private or was requested with invalid parameters.
        Any other alert (except the language chooser) is only reported as
        a warning.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1383
c5e8d7af
PH
1384
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel.

    When the channel id can be read from the channel page, the extraction
    is delegated to the channel's uploads playlist.  Otherwise the videos
    are collected from the paginated channel listing.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video id, title) pairs for the watch links in ``page``.

        Every id is reported once, in order of first appearance.  The
        title is taken from the first link of that id that carries one.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download is not fatal, so the page may be missing; only
        # search it when there is something to search.
        # NOTE(review): assumes a failed non-fatal download yields a falsy
        # value - confirm against _download_webpage.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1477
1478
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user (URL or "ytuser:" keyword)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor should handle ``url``.

        The URL pattern of this extractor is very permissive, so the URL is
        rejected as soon as any other extractor of this module (any module
        level name ending in 'IE') accepts it.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1505
b05654f0 1506
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` queries, scraping the paged results view."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra URL query parameters; subclasses override this to alter the search.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another until a page yields
        no videos or at least ``n`` videos have been collected; the list is
        then cut down to ``n`` entries and returned as a playlist result
        titled with the query.

        Raises ExtractorError (expected) when a downloaded page contains the
        'search-message' marker.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are available. Using '>=' (not
            # '>') avoids downloading one more page whose results would be
            # discarded when exactly n videos have already been collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1550
c9ae7b95 1551
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant that adds ``search_sort=video_date_uploaded`` to the query."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {
        'search_sort': 'video_date_uploaded',
    }
75dff0ee 1557
c9ae7b95
PH
1558
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for plain ``youtube.com/results?search_query=...`` URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [
        {
            'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
            'playlist_mincount': 5,
            'info_dict': {
                'title': 'youtube-dl test video',
            }
        },
    ]

    def _real_extract(self, url):
        """Download the results page and return its items as a playlist.

        The playlist title is the URL-decoded ``search_query`` value; each
        entry is a 'url' result built from the link found inside a
        ``yt-lockup-title`` heading.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only the first "item-section" list of the page is inspected.
        results_html = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for heading_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
            # The title is optional (fatal=False); the link is mandatory.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1600
1601
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist reference per season link."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [
        {
            'url': 'http://www.youtube.com/show/airdisasters',
            'playlist_mincount': 3,
            'info_dict': {
                'id': 'airdisasters',
                'title': 'Air Disasters',
            }
        },
    ]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # Every '/playlist?list=...' link on the page is treated as one
        # season of the show.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional: fatal=False keeps a missing title from
        # aborting the extraction.
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1636
1637
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page is downloaded first; further portions are fetched
        through the 'load more' link for as long as one is present and the
        current portion still contributes ids that were not seen before.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        seen_ids = set()  # mirrors `ids` for constant-time membership tests
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            ids.extend(new_ids)
            seen_ids.update(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1685
1686
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list, handled as the playlist 'WL'."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # Deliberately empty so the test cases of the parent class are not reused.
    _TESTS = []

    def _real_extract(self, url):
        """Extract the playlist 'WL' regardless of which accepted URL form was given."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1696
5f6a1245 1697
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its backing playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the first ``list=`` id on the favourites page and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1708
1709
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1715
1ed5b5c9 1716
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    # The shortcut accepted by _VALID_URL starts with a colon (':ytsubs'),
    # so the description advertises exactly that form, as the sibling feed
    # extractors do.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1722
1ed5b5c9 1723
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence. The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1729
1730
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why.

    Such URLs typically appear when an unquoted '&' made the shell cut the
    URL short; extraction always fails with an explanatory error.
    """

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1774
1775
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose ``v=`` id is shorter than the usual 11 characters."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)