]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[vimeo] Fix password protected videos again (#5082)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
4bb4a188 31 orderedSet,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
5f6a1245 37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie (its value includes ``hl=en``) on ``.youtube.com``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same as above: bail out instead of dereferencing None
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Getting the login form back means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here; nothing further happens either way
        if not self._login():
            return
c5e8d7af 184
8377574c 185
360e1ca5 186class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 187 IE_DESC = 'YouTube.com'
cb7dfeea 188 _VALID_URL = r"""(?x)^
c5e8d7af 189 (
edb53e2d 190 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 191 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 192 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 193 (?:www\.)?pwnyoutube\.com/|
f7000f3a 194 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
195 tube\.majestyc\.net/|
196 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
197 (?:.*?\#/)? # handle anchor (#/) redirect urls
198 (?: # the various things that can precede the ID:
ac7553d0 199 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 200 |(?: # or the v= param in all its forms
f7000f3a 201 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
202 (?:\?|\#!?) # the params delimiter ? or # or #!
203 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 v=
205 )
f4b05232
JMF
206 ))
207 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 208 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 209 )
c5e8d7af 210 )? # all until now is optional -> you can pass the naked ID
8963d9c2 211 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 212 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
213 (?(1).+)? # if we found the ID, everything can follow
214 $"""
c5e8d7af 215 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
216 _formats = {
217 '5': {'ext': 'flv', 'width': 400, 'height': 240},
218 '6': {'ext': 'flv', 'width': 450, 'height': 270},
219 '13': {'ext': '3gp'},
220 '17': {'ext': '3gp', 'width': 176, 'height': 144},
221 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
222 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
223 '34': {'ext': 'flv', 'width': 640, 'height': 360},
224 '35': {'ext': 'flv', 'width': 854, 'height': 480},
225 '36': {'ext': '3gp', 'width': 320, 'height': 240},
226 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
227 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
228 '43': {'ext': 'webm', 'width': 640, 'height': 360},
229 '44': {'ext': 'webm', 'width': 854, 'height': 480},
230 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
231 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
232
1d043b93 233
86fe61c8 234 # 3d videos
43b81eb9
PH
235 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
236 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
237 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
238 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
239 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
240 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
241 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 242
96fb5605 243 # Apple HTTP Live Streaming
43b81eb9
PH
244 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
245 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
246 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
247 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
248 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
249 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
250 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
251
252 # DASH mp4 video
43b81eb9
PH
253 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
254 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 258 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
259 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
261 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
262 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 264
f6f1fc92 265 # Dash mp4 audio
62cd676c
PH
266 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
267 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
268 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
269
270 # Dash webm
e75cafe9
A
271 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
272 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 277 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
278 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
279 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 285 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 286 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
287 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
288 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 289 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 291 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
292
293 # Dash webm audio
55db73ef 294 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 295 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 296
0857baad
PH
297 # Dash webm audio with opus inside
298 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
299 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
300 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
301
ce6b9a2d
PH
302 # RTMP (unnamed)
303 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 304 }
836a086c 305
78caa52a 306 IE_NAME = 'youtube'
2eb88d95
PH
307 _TESTS = [
308 {
4bc3a23e
PH
309 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
310 'info_dict': {
311 'id': 'BaW_jenozKc',
312 'ext': 'mp4',
313 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
314 'uploader': 'Philipp Hagemeister',
315 'uploader_id': 'phihag',
316 'upload_date': '20121002',
317 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
318 'categories': ['Science & Technology'],
3e7c1224
PH
319 'like_count': int,
320 'dislike_count': int,
2eb88d95 321 }
0e853ca4 322 },
0e853ca4 323 {
4bc3a23e
PH
324 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
325 'note': 'Test generic use_cipher_signature video (#897)',
326 'info_dict': {
327 'id': 'UxxajLWwzqY',
328 'ext': 'mp4',
329 'upload_date': '20120506',
330 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
331 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
332 'uploader': 'Icona Pop',
333 'uploader_id': 'IconaPop',
2eb88d95 334 }
c108eb73
JMF
335 },
336 {
4bc3a23e
PH
337 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
338 'note': 'Test VEVO video with age protection (#956)',
339 'info_dict': {
340 'id': '07FYdnEawAQ',
341 'ext': 'mp4',
342 'upload_date': '20130703',
343 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
344 'description': 'md5:64249768eec3bc4276236606ea996373',
345 'uploader': 'justintimberlakeVEVO',
346 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
347 }
348 },
fccd3771 349 {
4bc3a23e
PH
350 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
351 'note': 'Embed-only video (#1746)',
352 'info_dict': {
353 'id': 'yZIXLfi8CZQ',
354 'ext': 'mp4',
355 'upload_date': '20120608',
356 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
357 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
358 'uploader': 'SET India',
359 'uploader_id': 'setindia'
fccd3771
PH
360 }
361 },
dd27fd17 362 {
4bc3a23e
PH
363 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
364 'note': '256k DASH audio (format 141) via DASH manifest',
365 'info_dict': {
366 'id': 'a9LDPn-MO4I',
367 'ext': 'm4a',
368 'upload_date': '20121002',
369 'uploader_id': '8KVIDEO',
370 'description': '',
371 'uploader': '8KVIDEO',
372 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 373 },
4bc3a23e
PH
374 'params': {
375 'youtube_include_dash_manifest': True,
376 'format': '141',
4919603f 377 },
dd27fd17 378 },
3489b7d2
JMF
379 # DASH manifest with encrypted signature
380 {
78caa52a
PH
381 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
382 'info_dict': {
383 'id': 'IB3lcPjvWLA',
384 'ext': 'm4a',
b766eb27
JMF
385 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
386 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
387 'uploader': 'AfrojackVEVO',
388 'uploader_id': 'AfrojackVEVO',
389 'upload_date': '20131011',
3489b7d2 390 },
4bc3a23e 391 'params': {
78caa52a
PH
392 'youtube_include_dash_manifest': True,
393 'format': '141',
3489b7d2
JMF
394 },
395 },
aaeb86f6
S
396 # JS player signature function name containing $
397 {
398 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
399 'info_dict': {
400 'id': 'nfWlot6h_JM',
401 'ext': 'm4a',
402 'title': 'Taylor Swift - Shake It Off',
403 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
404 'uploader': 'TaylorSwiftVEVO',
405 'uploader_id': 'TaylorSwiftVEVO',
406 'upload_date': '20140818',
407 },
408 'params': {
409 'youtube_include_dash_manifest': True,
410 'format': '141',
411 },
412 },
aa79ac0c
PH
413 # Controversy video
414 {
415 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
416 'info_dict': {
417 'id': 'T4XJQO3qol8',
418 'ext': 'mp4',
419 'upload_date': '20100909',
420 'uploader': 'The Amazing Atheist',
421 'uploader_id': 'TheAmazingAtheist',
422 'title': 'Burning Everyone\'s Koran',
423 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
424 }
c522adb1
JMF
425 },
426 # Normal age-gate video (No vevo, embed allowed)
427 {
428 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
429 'info_dict': {
430 'id': 'HtVdAasjOgU',
431 'ext': 'mp4',
432 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 433 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
434 'uploader': 'The Witcher',
435 'uploader_id': 'WitcherGame',
436 'upload_date': '20140605',
437 },
438 },
fccae2b9
S
439 # Age-gate video with encrypted signature
440 {
441 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
442 'info_dict': {
443 'id': '6kLq3WMV1nU',
444 'ext': 'mp4',
445 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
446 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
447 'uploader': 'LloydVEVO',
448 'uploader_id': 'LloydVEVO',
449 'upload_date': '20110629',
450 },
451 },
774e208f
PH
452 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
453 {
454 'url': '__2ABJjxzNo',
455 'info_dict': {
456 'id': '__2ABJjxzNo',
457 'ext': 'mp4',
458 'upload_date': '20100430',
459 'uploader_id': 'deadmau5',
460 'description': 'md5:12c56784b8032162bb936a5f76d55360',
461 'uploader': 'deadmau5',
462 'title': 'Deadmau5 - Some Chords (HD)',
463 },
464 'expected_warnings': [
465 'DASH manifest missing',
466 ]
e52a40ab
PH
467 },
468 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
469 {
470 'url': 'lqQg6PlCWgI',
471 'info_dict': {
472 'id': 'lqQg6PlCWgI',
473 'ext': 'mp4',
cbe2bd91
PH
474 'upload_date': '20120731',
475 'uploader_id': 'olympic',
476 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
477 'uploader': 'Olympics',
478 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
479 },
480 'params': {
481 'skip_download': 'requires avconv',
e52a40ab 482 }
cbe2bd91 483 },
6271f1ca
PH
484 # Non-square pixels
485 {
486 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
487 'info_dict': {
488 'id': '_b-2C3KPAM0',
489 'ext': 'mp4',
490 'stretched_ratio': 16 / 9.,
491 'upload_date': '20110310',
492 'uploader_id': 'AllenMeow',
493 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
494 'uploader': '孫艾倫',
495 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
496 },
06b491eb
S
497 },
498 # url_encoded_fmt_stream_map is empty string
499 {
500 'url': 'qEJwOuvDf7I',
501 'info_dict': {
502 'id': 'qEJwOuvDf7I',
503 'ext': 'mp4',
504 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
505 'description': '',
506 'upload_date': '20150404',
507 'uploader_id': 'spbelect',
508 'uploader': 'Наблюдатели Петербурга',
509 },
510 'params': {
511 'skip_download': 'requires avconv',
512 }
513 },
2eb88d95
PH
514 ]
515
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature; keyed by (player_url, signature cache id)
    self._player_cache = {}
e0df6211 519
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 523
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
527
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
531
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen('RTMP download detected')
c5e8d7af 535
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return a string representation of a signature.

    The result is the length of each dot-separated part of *example_sig*,
    joined with dots (e.g. parts of length 3 and 5 give '3.5').
    """
    return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
539
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that maps a scrambled signature to its final form.

    The player type ('js' or 'swf') and id are parsed from *player_url*.
    A previously stored index list is loaded from the downloader cache
    ('youtube-sigfuncs') when available; otherwise the player is downloaded,
    parsed, and the resulting index list is stored in that cache.

    Raises ExtractorError if the player cannot be identified, the cache id
    is not a plain file name, or the player type is not supported.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check instead of assert, so it is still enforced under -O:
    # the cache id must not contain any path component
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec lists, for each output position, the input index
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly; an assert would vanish under -O and leave
        # `res` unbound below
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed a string whose k-th character has code point k, so the code
    # points of the output reveal which input index ends up where
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
585
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is probed with a string whose k-th character has code point k
    (as long as *example_sig*); the resulting index list is rendered as a
    concatenation of `s[...]` index/slice expressions and shown via
    to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # The stop bound is exclusive; a negative stop would be read
            # relative to the end, so leave the slice open-ended instead
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to emit for an empty index list
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Seed i so that a single-element index list is still emitted below
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 624
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in *jscode*.

    The function name is located with a regex on the player code and the
    function itself is extracted with JSInterpreter. The returned callable
    takes the signature string and passes it as the single argument.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    return lambda s: initial_function([s])
633
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the `decipher` function of the SWF player.

    The `SignatureDecipher` class is extracted from *file_contents* with
    SWFInterpreter. The returned callable takes the signature string and
    passes it as the single argument.
    """
    swfi = SWFInterpreter(file_contents)
    TARGET_CLASSNAME = 'SignatureDecipher'
    searched_class = swfi.extract_class(TARGET_CLASSNAME)
    initial_function = swfi.extract_function(searched_class, 'decipher')
    return lambda s: initial_function([s])
640
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The signature function is looked up in (or added to) self._player_cache,
    keyed by the player URL and the signature's cache id. *age_gate* is
    accepted but not used in this method.

    Raises ExtractorError if *player_url* is None or if anything fails
    while obtaining or applying the signature function (the traceback is
    included in the message).
    """

    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URL: default to https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    try:
        player_id = (player_url, self._signature_cache_id(s))
        if player_id not in self._player_cache:
            func = self._extract_signature_function(
                video_id, player_url, s
            )
            self._player_cache[player_id] = func
        func = self._player_cache[player_id]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(func, s)
        return func(s)
    except Exception as e:
        # Deliberately broad: any failure is re-raised with the traceback
        tb = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + tb, cause=e)
e0df6211 664
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    Each format is a dict with 'url' and 'ext' keys, one per extension in
    sbv/vtt/srt. Only the first track seen for each language is used.
    An empty dict is returned (with a warning) when the track list cannot
    be downloaded or contains no tracks. *webpage* is not used here.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in sub_lang_list:
            continue
        # Same for every extension, so encode it once per track
        track_name = track.attrib['name'].encode('utf-8')
        sub_formats = []
        for ext in ['sbv', 'vtt', 'srt']:
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track_name,
            })
            sub_formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + params,
                'ext': ext,
            })
        sub_lang_list[lang] = sub_formats
    if not sub_lang_list:
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
    return sub_lang_list
696
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping target language code to caption formats.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Each format is a dict with 'url' and 'ext' keys, one per extension in
    sbv/vtt/srt. An empty dict is returned (with a warning) when the player
    config is missing or unparsable, when required keys are absent, or
    when the caption list has no track.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match may have cut the object short, or the page
        # changed; treat it like a missing config instead of crashing
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']
        caption_kind = original_lang_node.attrib.get('kind', '')

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            sub_formats = []
            for ext in ['sbv', 'vtt', 'srt']:
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_formats.append({
                    'url': caption_url + '&' + params,
                    'ext': ext,
                })
            sub_lang_list[sub_lang] = sub_formats
        return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
749
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of ``cls._VALID_URL`` matched on *url*.

    The pattern is applied with ``re.VERBOSE``. Raises ExtractorError when
    the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
757
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download a formats manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with ``#`` is treated
    as a format URL; its itag is read from the ``itag/<digits>/`` path
    segment. Returns a dict ``{itag: url}``.
    """
    playlist = self._download_webpage(
        manifest_url, video_id, 'Downloading formats manifest')

    itag_to_url = {}
    for entry in playlist.split('\n'):
        # Blank lines and '#'-prefixed lines carry no URL.
        if not entry or entry.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', entry, 'itag')
        itag_to_url[itag] = entry
    return itag_to_url
772
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 776
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    Any ``/s/<sig>`` path segment of the manifest URL is replaced by
    ``/signature/<decrypted sig>`` before downloading. Each audio/video
    ``Representation`` that has a ``BaseURL`` becomes one format dict,
    layered on top of the static ``self._formats`` entry for its id (if
    any). Repeated representation ids update the format already collected
    instead of adding a duplicate.

    Representations of type ``text/vtt`` are skipped. Any other MIME type,
    including a missing ``mimeType`` attribute, is reported as a warning
    and skipped.

    Returns a list of format dicts. A representation without an ``id``
    attribute raises KeyError.
    """
    ns = '{urn:mpeg:DASH:schema:MPD:2011}'
    yt_ns = '{http://youtube.com/yt/2012/10/10}'

    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for a in dash_doc.findall('.//%sAdaptationSet' % ns):
        # May be None when the AdaptationSet carries no mimeType attribute.
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('%sRepresentation' % ns):
            url_el = r.find('%sBaseURL' % ns)
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            elif mime_type is not None and mime_type.startswith(('audio/', 'video/')):
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('%scontentLength' % yt_ns))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                existing_format = next(
                    (fo for fo in formats if fo['format_id'] == format_id),
                    None)
                if existing_format is None:
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                # Also reached for a missing mimeType (None); previously that
                # case crashed with AttributeError on None.startswith.
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
826
def _real_extract(self, url):
    """Extract metadata and formats for a single video URL.

    Downloads the watch page, obtains the ``video_info`` mapping (from the
    embedded ``ytplayer.config`` or from ``get_video_info``), scrapes the
    metadata, builds the format list (RTMP, URL-encoded stream maps or HLS
    manifest, plus the DASH manifest unless disabled) and returns the info
    dict built at the end of the method.

    Raises ExtractorError when no ``token`` is present in the video info,
    for "rental" videos, when the uploader name is missing, for rtmpe
    streams, and when no usable stream information is found.
    """
    # Scheme used for every request made below; plain http only when the
    # 'prefer_insecure' downloader parameter is set.
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes (e.g. "\/") from the captured URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            # 'sts' is read from the embed page; empty string if not found.
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if not args.get('url_encoded_fmt_stream_map'):
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            # Try each 'el' variant in turn and stop at the first response
            # that contains a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            # A 'reason' is reported as an expected (user-facing) error.
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Replace '/', ',' and '-' by spaces and collapse whitespace before
        # handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories: only the first link inside the "Category" list is used
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag, or to ''.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read the number shown in the element with id "watch-<count_name>",
        # dropping thousands separators; None when it is not found.
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} mapping into format dicts, adding the static
        # self._formats entry for itags that have one.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        # Both maps are comma-separated lists of query strings; handle them
        # as one list.
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                # 'sig' is appended to the URL as-is.
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # 's' has to go through _decrypt_signature, which needs the
                # player URL located below.
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # NOTE: this rebinds player_url for the rest of the method
                # (it is also passed to the DASH manifest parser below).
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    # Diagnostics only: describe the player and the
                    # signature layout on screen.
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                # A failing DASH manifest is not fatal: warn and keep the
                # formats collected above.
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for f in formats:
                    if f['format_id'] in dash_keys:
                        f['format_id'] = 'nondash-%s' % f['format_id']
                        f['preference'] = f.get('preference', 0) - 10000
                formats.extend(dash_formats)

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        # Applied to every format whose vcodec is not 'none'.
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1165
5f6a1245 1166
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists and mixes.

    Accepts playlist URLs (several URL shapes, see ``_VALID_URL``) as well
    as bare playlist ids carrying one of the known prefixes.
    """
    IE_DESC = 'YouTube.com playlists'
    # Group 1: id taken from a playlist URL; group 2: bare prefixed id.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout'; the other entries in this list
        # use 'playlist_mincount'.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix, i.e. a playlist generated from a single video.

        The watch page of the seed video (the last 11 characters of
        *playlist_id*) is downloaded and scraped for the title and for the
        ids of the videos linked with this playlist id.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The first of these class names that yields an element is used.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following its "load more" pages.

        Raises ExtractorError (expected) when the page says the playlist
        does not exist or is private.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # For the first page both the video list and the "load more" widget
        # are looked up in the full HTML page.
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch *url* to single-video, mix or regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1355
c5e8d7af
PH
1356
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links found in *page*.

        Ids keep the order of their first occurrence. When an id occurs
        several times, the first non-empty title seen for it is kept.
        """
        found_ids = []
        found_titles = []
        link_re = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
        for link in re.finditer(link_re, page):
            vid = link.group('id')
            title = unescapeHTML(link.group('title'))
            if vid not in found_ids:
                found_ids.append(vid)
                found_titles.append(title)
                continue
            # Known id: only fill in a title that is still missing.
            position = found_ids.index(vid)
            if title and not found_titles[position]:
                found_titles[position] = title
        return zip(found_ids, found_titles)

    def _real_extract(self, url):
        """Return a playlist of all the videos of the channel in *url*."""
        channel_id = self._match_id(url)

        first_page = self._download_webpage(
            self._TEMPLATE_URL % channel_id, channel_id, 'Downloading page #1')
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', first_page)

        if autogenerated_label is not None:
            # Autogenerated channels hold every video in this single page;
            # their ajax pages are empty and cannot be used.
            single_page_entries = []
            for vid, title in self.extract_videos_from_page(first_page):
                single_page_entries.append(self.url_result(
                    vid, 'Youtube', video_id=vid, video_title=title))
            return self.playlist_result(single_page_entries, channel_id)

        def _entries():
            # Lazily walk the channel: yield the videos of the current page,
            # then follow the "load more" link until none is left.
            videos_html = first_page
            widget_html = first_page
            for next_page_num in itertools.count(2):
                for vid, title in self.extract_videos_from_page(videos_html):
                    yield self.url_result(
                        vid, 'Youtube', video_id=vid, video_title=title)

                more_link = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    widget_html)
                if not more_link:
                    return

                ajax_page = self._download_json(
                    'https://youtube.com/%s' % more_link.group('more'), channel_id,
                    'Downloading page #%s' % next_page_num,
                    transform_source=uppercase_escape)
                videos_html = ajax_page['content_html']
                widget_html = ajax_page['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1431
1432
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a user's videos, given a user URL or ``ytuser:<name>``."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other extractor in this module does.

        The user regex is very permissive, so every other module-level name
        ending in 'IE' gets the first say.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1459
b05654f0 1460
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` queries, scraping the results pages."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    # Extra query arguments merged into every results request; overridden
    # by subclasses.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Return a playlist with at most *n* results for *query*.

        Results pages are fetched one after another until a page brings no
        new videos or more than *n* videos have been collected. Raises
        ExtractorError (expected) when a page carries a search message.
        """
        collected = []

        for page_number in itertools.count(1):
            url_query = {
                'search_query': query,
                'page': page_number,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            encoded_query = compat_urllib_parse.urlencode(url_query)
            response = self._download_json(
                'https://www.youtube.com/results?' + encoded_query,
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_number,
                errnote='Unable to download API page')
            results_html = response[1]['body']['content']

            if 'class="search-message' in results_html:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            page_ids = orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', results_html))
            page_results = self._ids_to_results(page_ids)
            collected += page_results
            if len(collected) > n or not page_results:
                break

        # Only slice when needed: n may be float('inf'), which is not a
        # valid slice bound.
        if len(collected) > n:
            collected = collected[:n]
        return self.playlist_result(collected, query)
75dff0ee 1504
c9ae7b95 1505
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YouTube search variant that requests results sorted by upload date."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    # Added to the query string of every results request by the parent class.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1511
c9ae7b95
PH
1512
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results search URL into a playlist of its items."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Download the result page at *url* and return its items as a playlist.

        The playlist title is the URL-decoded ``search_query`` value.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only the contents of the "item-section" list are inspected.
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated (non-fatal); a missing link is not.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1554
1555
class YoutubeShowIE(InfoExtractor):
    """Expand a youtube.com/show/ page into one playlist entry per season link."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional; extraction continues without it.
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1590
1591
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL of a feed page; the one remaining ``%s`` takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all found videos as a playlist."""
        feed_entries = []
        paging = 0
        for page_num in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget, the "load more" link is searched
            # for in the feed HTML itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            feed_entries += [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No further page is advertised.
                break
            paging = next_page.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1639
5f6a1245 1640
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the ``recommended`` feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    IE_NAME = 'youtube:recommended'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Values consumed by the feed base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
c626a3d9 1647
5f6a1245 1648
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Playlist extractor bound to the ``WL`` playlist id."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Extract the ``WL`` playlist regardless of which URL form matched."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
1658
5f6a1245 1659
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal ``history`` feed."""
    IE_NAME = 'youtube:history'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: "\." is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning on recent Python versions. The
    # resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Makes the base class request action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1667
5f6a1245 1668
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the ``list=`` value on the favourites page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1679
1680
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed, following its "load more" links."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    IE_NAME = 'youtube:subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist built from every video id found in the feed pages."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        # For the initial HTML page, the video links and the "load more"
        # widget are both searched for in the same document.
        content_html = first_page
        more_widget_html = first_page

        for page_num in itertools.count(1):
            ids += orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_match is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html, more_widget_html = (
                more['content_html'], more['load_more_widget_html'])

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1717
1718
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a hint about shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1762
1763
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose ``v`` value is 1 to 10 characters long and report them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)