]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[downloader/dash] Add testing facility
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
4bb4a188 31 orderedSet,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (hl=en) on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one url_result (for the 'Youtube' extractor) per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False explicitly so the result matches the documented
            # contract (a bare return would hand None to the caller).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of dereferencing the failed match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same as above: a missing field means the login attempt failed.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # The TFA form being served again means the code was not accepted
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; do nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 189
8377574c 190
360e1ca5 191class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 192 IE_DESC = 'YouTube.com'
cb7dfeea 193 _VALID_URL = r"""(?x)^
c5e8d7af 194 (
edb53e2d 195 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 196 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 197 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 198 (?:www\.)?pwnyoutube\.com/|
f7000f3a 199 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
200 tube\.majestyc\.net/|
201 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
202 (?:.*?\#/)? # handle anchor (#/) redirect urls
203 (?: # the various things that can precede the ID:
ac7553d0 204 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 205 |(?: # or the v= param in all its forms
f7000f3a 206 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
207 (?:\?|\#!?) # the params delimiter ? or # or #!
208 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 v=
210 )
f4b05232
JMF
211 ))
212 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 213 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 214 )
c5e8d7af 215 )? # all until now is optional -> you can pass the naked ID
8963d9c2 216 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 217 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
218 (?(1).+)? # if we found the ID, everything can follow
219 $"""
c5e8d7af 220 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
221 _formats = {
222 '5': {'ext': 'flv', 'width': 400, 'height': 240},
223 '6': {'ext': 'flv', 'width': 450, 'height': 270},
224 '13': {'ext': '3gp'},
225 '17': {'ext': '3gp', 'width': 176, 'height': 144},
226 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
227 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
228 '34': {'ext': 'flv', 'width': 640, 'height': 360},
229 '35': {'ext': 'flv', 'width': 854, 'height': 480},
230 '36': {'ext': '3gp', 'width': 320, 'height': 240},
231 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
232 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
233 '43': {'ext': 'webm', 'width': 640, 'height': 360},
234 '44': {'ext': 'webm', 'width': 854, 'height': 480},
235 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
236 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
237
1d043b93 238
86fe61c8 239 # 3d videos
43b81eb9
PH
240 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
243 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
244 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
245 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
246 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 247
96fb5605 248 # Apple HTTP Live Streaming
43b81eb9
PH
249 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
250 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
251 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
252 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
253 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
254 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
255 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
256
257 # DASH mp4 video
43b81eb9
PH
258 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
262 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 263 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
264 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
265 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
266 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
267 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
268 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 269
f6f1fc92 270 # Dash mp4 audio
62cd676c
PH
271 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
272 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
273 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
274
275 # Dash webm
e75cafe9
A
276 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
278 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
279 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
280 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
281 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 282 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
283 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
287 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
288 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
289 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 290 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 291 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
292 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
293 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 294 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 295 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 296 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
297
298 # Dash webm audio
55db73ef 299 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 300 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 301
0857baad
PH
302 # Dash webm audio with opus inside
303 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
304 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
305 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
306
ce6b9a2d
PH
307 # RTMP (unnamed)
308 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 309 }
836a086c 310
78caa52a 311 IE_NAME = 'youtube'
2eb88d95
PH
312 _TESTS = [
313 {
4bc3a23e
PH
314 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
315 'info_dict': {
316 'id': 'BaW_jenozKc',
317 'ext': 'mp4',
318 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
319 'uploader': 'Philipp Hagemeister',
320 'uploader_id': 'phihag',
321 'upload_date': '20121002',
322 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
323 'categories': ['Science & Technology'],
3e7c1224
PH
324 'like_count': int,
325 'dislike_count': int,
2eb88d95 326 }
0e853ca4 327 },
0e853ca4 328 {
4bc3a23e
PH
329 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
330 'note': 'Test generic use_cipher_signature video (#897)',
331 'info_dict': {
332 'id': 'UxxajLWwzqY',
333 'ext': 'mp4',
334 'upload_date': '20120506',
335 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
336 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
337 'uploader': 'Icona Pop',
338 'uploader_id': 'IconaPop',
2eb88d95 339 }
c108eb73
JMF
340 },
341 {
4bc3a23e
PH
342 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
343 'note': 'Test VEVO video with age protection (#956)',
344 'info_dict': {
345 'id': '07FYdnEawAQ',
346 'ext': 'mp4',
347 'upload_date': '20130703',
348 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
349 'description': 'md5:64249768eec3bc4276236606ea996373',
350 'uploader': 'justintimberlakeVEVO',
351 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
352 }
353 },
fccd3771 354 {
4bc3a23e
PH
355 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
356 'note': 'Embed-only video (#1746)',
357 'info_dict': {
358 'id': 'yZIXLfi8CZQ',
359 'ext': 'mp4',
360 'upload_date': '20120608',
361 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
362 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
363 'uploader': 'SET India',
364 'uploader_id': 'setindia'
fccd3771
PH
365 }
366 },
dd27fd17 367 {
4bc3a23e
PH
368 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
369 'note': '256k DASH audio (format 141) via DASH manifest',
370 'info_dict': {
371 'id': 'a9LDPn-MO4I',
372 'ext': 'm4a',
373 'upload_date': '20121002',
374 'uploader_id': '8KVIDEO',
375 'description': '',
376 'uploader': '8KVIDEO',
377 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 378 },
4bc3a23e
PH
379 'params': {
380 'youtube_include_dash_manifest': True,
381 'format': '141',
4919603f 382 },
dd27fd17 383 },
3489b7d2
JMF
384 # DASH manifest with encrypted signature
385 {
78caa52a
PH
386 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
387 'info_dict': {
388 'id': 'IB3lcPjvWLA',
389 'ext': 'm4a',
b766eb27
JMF
390 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
391 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
392 'uploader': 'AfrojackVEVO',
393 'uploader_id': 'AfrojackVEVO',
394 'upload_date': '20131011',
3489b7d2 395 },
4bc3a23e 396 'params': {
78caa52a
PH
397 'youtube_include_dash_manifest': True,
398 'format': '141',
3489b7d2
JMF
399 },
400 },
aaeb86f6
S
401 # JS player signature function name containing $
402 {
403 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
404 'info_dict': {
405 'id': 'nfWlot6h_JM',
406 'ext': 'm4a',
407 'title': 'Taylor Swift - Shake It Off',
408 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
409 'uploader': 'TaylorSwiftVEVO',
410 'uploader_id': 'TaylorSwiftVEVO',
411 'upload_date': '20140818',
412 },
413 'params': {
414 'youtube_include_dash_manifest': True,
415 'format': '141',
416 },
417 },
aa79ac0c
PH
418 # Controversy video
419 {
420 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
421 'info_dict': {
422 'id': 'T4XJQO3qol8',
423 'ext': 'mp4',
424 'upload_date': '20100909',
425 'uploader': 'The Amazing Atheist',
426 'uploader_id': 'TheAmazingAtheist',
427 'title': 'Burning Everyone\'s Koran',
428 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
429 }
c522adb1
JMF
430 },
431 # Normal age-gate video (No vevo, embed allowed)
432 {
433 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
434 'info_dict': {
435 'id': 'HtVdAasjOgU',
436 'ext': 'mp4',
437 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 438 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
439 'uploader': 'The Witcher',
440 'uploader_id': 'WitcherGame',
441 'upload_date': '20140605',
442 },
443 },
fccae2b9
S
444 # Age-gate video with encrypted signature
445 {
446 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
447 'info_dict': {
448 'id': '6kLq3WMV1nU',
449 'ext': 'mp4',
450 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
451 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
452 'uploader': 'LloydVEVO',
453 'uploader_id': 'LloydVEVO',
454 'upload_date': '20110629',
455 },
456 },
774e208f
PH
457 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
458 {
459 'url': '__2ABJjxzNo',
460 'info_dict': {
461 'id': '__2ABJjxzNo',
462 'ext': 'mp4',
463 'upload_date': '20100430',
464 'uploader_id': 'deadmau5',
465 'description': 'md5:12c56784b8032162bb936a5f76d55360',
466 'uploader': 'deadmau5',
467 'title': 'Deadmau5 - Some Chords (HD)',
468 },
469 'expected_warnings': [
470 'DASH manifest missing',
471 ]
e52a40ab
PH
472 },
473 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
474 {
475 'url': 'lqQg6PlCWgI',
476 'info_dict': {
477 'id': 'lqQg6PlCWgI',
478 'ext': 'mp4',
cbe2bd91
PH
479 'upload_date': '20120731',
480 'uploader_id': 'olympic',
481 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
482 'uploader': 'Olympics',
483 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
484 },
485 'params': {
486 'skip_download': 'requires avconv',
e52a40ab 487 }
cbe2bd91 488 },
6271f1ca
PH
489 # Non-square pixels
490 {
491 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
492 'info_dict': {
493 'id': '_b-2C3KPAM0',
494 'ext': 'mp4',
495 'stretched_ratio': 16 / 9.,
496 'upload_date': '20110310',
497 'uploader_id': 'AllenMeow',
498 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
499 'uploader': '孫艾倫',
500 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
501 },
06b491eb
S
502 },
503 # url_encoded_fmt_stream_map is empty string
504 {
505 'url': 'qEJwOuvDf7I',
506 'info_dict': {
507 'id': 'qEJwOuvDf7I',
508 'ext': 'mp4',
509 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
510 'description': '',
511 'upload_date': '20150404',
512 'uploader_id': 'spbelect',
513 'uploader': 'Наблюдатели Петербурга',
514 },
515 'params': {
516 'skip_download': 'requires avconv',
517 }
518 },
2eb88d95
PH
519 ]
520
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted by this instance
    self._player_cache = dict()
e0df6211 524
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 528
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
532
def report_unavailable_format(self, video_id, format):
    """Print a note that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
536
def report_rtmp_download(self):
    """Print a note that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 540
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of example_sig."""
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
544
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms a signature string for the given player.

    The player id and type (file extension) are parsed from player_url.
    If the downloader cache holds an index list for this player and
    signature layout, the callable is rebuilt from it; otherwise the
    player is downloaded, parsed, and the resulting index list is stored.

    Raises ExtractorError if player_url cannot be parsed or the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Invariant check: the id is used as a cache key and must not contain
    # any path components
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, for each output position, the input index to copy
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would be stripped under "python -O"
        # and execution would continue with `res` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string whose i-th character has code point i through the
    # function; the code points of the result are the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
590
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is applied to a probe string as long as example_sig; the
    resulting index list is rendered as a concatenation of index and
    slice expressions on `s` and written out with to_screen.
    """
    def gen_sig_code(idxs):
        # Yield 's[...]' expression strings that together reproduce idxs,
        # collapsing runs that step by +1 or -1 into slices.
        def _genslice(start, end, step):
            # Render the slice text: start is omitted when 0, the bound is
            # one step past `end` (or left open when that would be
            # negative), and the step is omitted when it is 1.
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Walk consecutive pairs (prev, i) of indices
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the finished run as a slice
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run begins at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is still pending after the last pair.
        # NOTE(review): `i` is only bound by the loop above, so idxs with
        # fewer than two elements would fail here - confirm callers never
        # pass such input.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Probe string: the i-th character has code point i, so the code points
    # of func's output are the input indices it selected
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    # Lengths of the dot-separated parts of the example signature
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 629
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in the JS player code.

    The function name is located with a regex on jscode and the function
    itself is extracted through JSInterpreter.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def _apply(s):
        # The extracted function takes its arguments as a list
        return js_function([s])
    return _apply
638
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping SignatureDecipher.decipher from the SWF player."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])
    return _apply
645
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    Signature functions are kept in self._player_cache, keyed by the
    player URL and the signature cache id of s. Any exception raised
    while extracting or applying the function is re-raised as an
    ExtractorError carrying the traceback text.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Give protocol-relative player URLs an explicit https scheme
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        sig_func = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 669
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to lists of subtitle formats.

    The track list is downloaded from video.google.com/timedtext. Each
    language gets one entry per extension (sbv, vtt, srt) with 'url' and
    'ext' keys; only the first track of a language is used. An empty dict
    is returned (after a warning) when the list cannot be downloaded or
    contains no tracks.
    """
    try:
        track_list = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in track_list.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        # Encoded up front because Python 2's urlencode chokes on unicode
        track_name = track.attrib['name'].encode('utf-8')
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track_name,
            }),
            'ext': ext,
        } for ext in ['sbv', 'vtt', 'srt']]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
701
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping target language codes to lists of caption formats.

    The webpage is passed in because the captions URL is read from the
    ytplayer.config JSON embedded in it. Each language gets one entry per
    extension (sbv, vtt, srt) with 'url' and 'ext' keys. An empty dict is
    returned (after a warning) when no captions can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']

        # Fetch the list of available caption tracks and target languages
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)

        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            captions[target_lang] = [{
                'url': caption_url + '&' + compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in ['sbv', 'vtt', 'srt']]
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
754
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id found in ``url``.

    ``url`` is matched against ``cls._VALID_URL`` (a verbose regex); the
    id is its second capture group.  Raises ExtractorError when the URL
    does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
762
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; the itag is taken from its ``itag/<digits>/`` path
    component.  Returns a dict {itag: url}.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and m3u8 tags/comments
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
777
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document of a video."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 781
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    Any encrypted signature embedded in the manifest URL (``/s/<sig>``) is
    decrypted first.  Every audio/video Representation that has a BaseURL
    yields one format dict; static data from ``self._formats`` is used as
    a base and overridden by the values found in the manifest.

    Returns a list of format dicts.  Errors raised while downloading the
    manifest are propagated to the caller.
    """
    mpd_ns = '{urn:mpeg:DASH:schema:MPD:2011}'

    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for a in dash_doc.findall('.//' + mpd_ns + 'AdaptationSet'):
        # May be None when the AdaptationSet carries no mimeType attribute
        mime_type = a.attrib.get('mimeType')
        for r in a.findall(mpd_ns + 'Representation'):
            url_el = r.find(mpd_ns + 'BaseURL')
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            elif mime_type and mime_type.startswith(('audio/', 'video/')):
                segment_list = r.find(mpd_ns + 'SegmentList')
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                if segment_list is not None:
                    f.update({
                        'initialization_url': segment_list.find(mpd_ns + 'Initialization').attrib['sourceURL'],
                        'segment_urls': [
                            segment.attrib.get('media')
                            for segment in segment_list.findall(mpd_ns + 'SegmentURL')],
                        'protocol': 'http_dash_segments',
                    })
                # The same format id may appear more than once in the
                # manifest: merge into the entry already collected
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                # Also reached for a missing MIME type (reported as None)
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
838
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    Steps: resolve redirection URLs, download the watch page, obtain the
    video info (from the embedded player config, or from get_video_info
    for age-gated videos / as a fallback), scrape the metadata from the
    page, build the format list (rtmp, url-encoded stream map with
    signature decryption, or HLS manifest) and finally merge the formats
    found in the DASH manifest.

    Returns the info dict of the video.  Raises ExtractorError when the
    video info cannot be obtained or no format information is present.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if not args.get('url_encoded_fmt_stream_map'):
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        # don't panic if we can't find it
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full target in "title"
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # jsplayer_url_json is still None for an age-gated video
                # whose embed page lacks the assets entry; json.loads(None)
                # would raise TypeError, so use the fallback below instead
                if jsplayer_url_json:
                    player_url = json.loads(jsplayer_url_json)
                else:
                    player_url = None
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                dash_keys = set(df['format_id'] for df in dash_formats)
                formats = [f for f in formats if f['format_id'] not in dash_keys]
                formats.extend(dash_formats)

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1178
5f6a1245 1179
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists and auto-generated mixes."""
    IE_DESC = 'YouTube.com playlists'
    # Group 1: playlist id taken from a URL, group 2: bare playlist id
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix; return a playlist result."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following its "Load more" pages.

        Raises ExtractorError (expected) when the page reports that the
        playlist does not exist, is private or has invalid parameters.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1373
c5e8d7af
PH
1374
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links in ``page``.

        Ids keep the order of their first appearance.  When an id occurs
        several times, the first non-empty title found is used.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        # The download above is non-fatal, so it may yield a falsy value
        # instead of the page; searching that would raise instead of
        # letting the page-by-page fallback below run
        channel_playlist_id = None
        if channel_page:
            channel_playlist_id = self._search_regex(
                [r'<meta itemprop="channelId" content="([^"]+)">',
                 r'data-channel-external-id="([^"]+)"'],
                channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1465
1466
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the uploads of a YouTube user."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # This extractor's regex is very permissive and would match URLs
        # meant for the other youtube extractors, so it only claims a URL
        # that no other *IE class of this module accepts.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1493
b05654f0 1494
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Search extractor that pages through youtube.com/results for a query."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string parameters merged into every results request;
    # subclasses override this to alter the search (e.g. the sort order).
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are requested one after another until a page yields no
        video ids or at least ``n`` videos have been collected.

        query -- the search terms
        n     -- maximum number of results to return; presumably this can be
                 the infinite _MAX_RESULTS when the caller wants everything
                 (confirm against SearchInfoExtractor)

        Returns a playlist result titled with the query.
        Raises ExtractorError (expected) when the page reports no results.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query,
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are available; using ">" here
            # would download one extra page when exactly n were collected.
            if not new_videos or len(videos) >= n:
                break

        # Keep the guard: n may be float('inf'), which is not a valid slice
        # bound, and in that case no trimming is needed anyway.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1538
c9ae7b95 1539
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YouTube search variant that requests results sorted by upload date."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    # Added to the query string of every results request
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1545
c9ae7b95
PH
1546
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results search URL into a playlist of its result links."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Download the results page and return its items as a playlist.

        The playlist title is the URL-decoded search query; each entry is a
        'url' result pointing at the absolute link found in an item heading.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)

        # Restrict further parsing to the ordered list of result items
        results_html = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for heading in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
            # The title is optional (fatal=False), the link is mandatory
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1588
1589
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages, yielding one playlist entry per season link."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            # The og:title lookup is non-fatal, so the title may be missing
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1624
1625
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed page and its "load more" pages.

        The passed url is not used; the feed address is built from
        _FEED_NAME. Returns a playlist result titled _PLAYLIST_TITLE.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never trigger.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1673
1674
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "watch later" list, handled as the playlist 'WL'."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Extract the fixed 'WL' playlist, whatever form of URL was given."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1684
5f6a1245 1685
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its backing playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is taken from the first "list=" parameter found in the page
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1696
1697
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for https://www.youtube.com/feed/recommended."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1703
1ed5b5c9 1704
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for https://www.youtube.com/feed/subscriptions."""
    # The shortcut accepted by _VALID_URL starts with a colon (":ytsubs"),
    # so the description spells it that way, like the other feed extractors.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1710
1ed5b5c9 1711
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for https://www.youtube.com/feed/history."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: "\." in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions; the value is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1717
1718
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a shell-quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(quoting_hint, expected=True)
772fd5cc
PH
1762
1763
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose v= value has only 1 to 10 id characters."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id and the URL."""
        short_id = self._match_id(url)
        complaint = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(complaint, expected=True)