]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add test for #5361
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) so the result matches the
            # contract stated in the docstring.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be built; bail out
                # instead of dereferencing the failed match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Getting the second-factor form back means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 185
8377574c 186
360e1ca5 187class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 259 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
62cd676c
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 290 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 291 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 292 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
293
294 # Dash webm audio
55db73ef 295 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 296 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 297
0857baad
PH
298 # Dash webm audio with opus inside
299 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
300 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
301 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
302
ce6b9a2d
PH
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
78caa52a 307 IE_NAME = 'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
4bc3a23e
PH
310 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
311 'info_dict': {
312 'id': 'BaW_jenozKc',
313 'ext': 'mp4',
314 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
315 'uploader': 'Philipp Hagemeister',
316 'uploader_id': 'phihag',
317 'upload_date': '20121002',
318 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
319 'categories': ['Science & Technology'],
3e7c1224
PH
320 'like_count': int,
321 'dislike_count': int,
2eb88d95 322 }
0e853ca4 323 },
0e853ca4 324 {
4bc3a23e
PH
325 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
326 'note': 'Test generic use_cipher_signature video (#897)',
327 'info_dict': {
328 'id': 'UxxajLWwzqY',
329 'ext': 'mp4',
330 'upload_date': '20120506',
331 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
332 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
333 'uploader': 'Icona Pop',
334 'uploader_id': 'IconaPop',
2eb88d95 335 }
c108eb73
JMF
336 },
337 {
4bc3a23e
PH
338 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
339 'note': 'Test VEVO video with age protection (#956)',
340 'info_dict': {
341 'id': '07FYdnEawAQ',
342 'ext': 'mp4',
343 'upload_date': '20130703',
344 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
345 'description': 'md5:64249768eec3bc4276236606ea996373',
346 'uploader': 'justintimberlakeVEVO',
347 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
348 }
349 },
fccd3771 350 {
4bc3a23e
PH
351 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
352 'note': 'Embed-only video (#1746)',
353 'info_dict': {
354 'id': 'yZIXLfi8CZQ',
355 'ext': 'mp4',
356 'upload_date': '20120608',
357 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
358 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
359 'uploader': 'SET India',
360 'uploader_id': 'setindia'
fccd3771
PH
361 }
362 },
dd27fd17 363 {
4bc3a23e
PH
364 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
365 'note': '256k DASH audio (format 141) via DASH manifest',
366 'info_dict': {
367 'id': 'a9LDPn-MO4I',
368 'ext': 'm4a',
369 'upload_date': '20121002',
370 'uploader_id': '8KVIDEO',
371 'description': '',
372 'uploader': '8KVIDEO',
373 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 374 },
4bc3a23e
PH
375 'params': {
376 'youtube_include_dash_manifest': True,
377 'format': '141',
4919603f 378 },
dd27fd17 379 },
3489b7d2
JMF
380 # DASH manifest with encrypted signature
381 {
78caa52a
PH
382 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
383 'info_dict': {
384 'id': 'IB3lcPjvWLA',
385 'ext': 'm4a',
b766eb27
JMF
386 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
387 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
388 'uploader': 'AfrojackVEVO',
389 'uploader_id': 'AfrojackVEVO',
390 'upload_date': '20131011',
3489b7d2 391 },
4bc3a23e 392 'params': {
78caa52a
PH
393 'youtube_include_dash_manifest': True,
394 'format': '141',
3489b7d2
JMF
395 },
396 },
aaeb86f6
S
397 # JS player signature function name containing $
398 {
399 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
400 'info_dict': {
401 'id': 'nfWlot6h_JM',
402 'ext': 'm4a',
403 'title': 'Taylor Swift - Shake It Off',
404 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
405 'uploader': 'TaylorSwiftVEVO',
406 'uploader_id': 'TaylorSwiftVEVO',
407 'upload_date': '20140818',
408 },
409 'params': {
410 'youtube_include_dash_manifest': True,
411 'format': '141',
412 },
413 },
aa79ac0c
PH
414 # Controversy video
415 {
416 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
417 'info_dict': {
418 'id': 'T4XJQO3qol8',
419 'ext': 'mp4',
420 'upload_date': '20100909',
421 'uploader': 'The Amazing Atheist',
422 'uploader_id': 'TheAmazingAtheist',
423 'title': 'Burning Everyone\'s Koran',
424 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
425 }
c522adb1
JMF
426 },
427 # Normal age-gate video (No vevo, embed allowed)
428 {
429 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
430 'info_dict': {
431 'id': 'HtVdAasjOgU',
432 'ext': 'mp4',
433 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 434 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
435 'uploader': 'The Witcher',
436 'uploader_id': 'WitcherGame',
437 'upload_date': '20140605',
438 },
439 },
fccae2b9
S
440 # Age-gate video with encrypted signature
441 {
442 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
443 'info_dict': {
444 'id': '6kLq3WMV1nU',
445 'ext': 'mp4',
446 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
447 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
448 'uploader': 'LloydVEVO',
449 'uploader_id': 'LloydVEVO',
450 'upload_date': '20110629',
451 },
452 },
774e208f
PH
453 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
454 {
455 'url': '__2ABJjxzNo',
456 'info_dict': {
457 'id': '__2ABJjxzNo',
458 'ext': 'mp4',
459 'upload_date': '20100430',
460 'uploader_id': 'deadmau5',
461 'description': 'md5:12c56784b8032162bb936a5f76d55360',
462 'uploader': 'deadmau5',
463 'title': 'Deadmau5 - Some Chords (HD)',
464 },
465 'expected_warnings': [
466 'DASH manifest missing',
467 ]
e52a40ab
PH
468 },
469 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
470 {
471 'url': 'lqQg6PlCWgI',
472 'info_dict': {
473 'id': 'lqQg6PlCWgI',
474 'ext': 'mp4',
cbe2bd91
PH
475 'upload_date': '20120731',
476 'uploader_id': 'olympic',
477 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
478 'uploader': 'Olympics',
479 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
480 },
481 'params': {
482 'skip_download': 'requires avconv',
e52a40ab 483 }
cbe2bd91 484 },
6271f1ca
PH
485 # Non-square pixels
486 {
487 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
488 'info_dict': {
489 'id': '_b-2C3KPAM0',
490 'ext': 'mp4',
491 'stretched_ratio': 16 / 9.,
492 'upload_date': '20110310',
493 'uploader_id': 'AllenMeow',
494 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
495 'uploader': '孫艾倫',
496 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
497 },
06b491eb
S
498 },
499 # url_encoded_fmt_stream_map is empty string
500 {
501 'url': 'qEJwOuvDf7I',
502 'info_dict': {
503 'id': 'qEJwOuvDf7I',
504 'ext': 'mp4',
505 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
506 'description': '',
507 'upload_date': '20150404',
508 'uploader_id': 'spbelect',
509 'uploader': 'Наблюдатели Петербурга',
510 },
511 'params': {
512 'skip_download': 'requires avconv',
513 }
514 },
2eb88d95
PH
515 ]
516
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature cache id).
    self._player_cache = {}
e0df6211 520
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 524
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
528
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
532
def report_rtmp_download(self):
    """Tell the user that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 536
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
540
541 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 542 id_m = re.match(
60620368 543 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 544 player_url)
c081b35c
PH
545 if not id_m:
546 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
547 player_type = id_m.group('ext')
548 player_id = id_m.group('id')
549
c4417ddb 550 # Read from filesystem cache
60064c53
PH
551 func_id = '%s_%s_%s' % (
552 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 553 assert os.path.basename(func_id) == func_id
a0e07d31 554
69ea8ca4 555 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 556 if cache_spec is not None:
78caa52a 557 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 558
6d1a55a5
PH
559 download_note = (
560 'Downloading player %s' % player_url
561 if self._downloader.params.get('verbose') else
562 'Downloading %s player %s' % (player_type, player_id)
563 )
e0df6211
PH
564 if player_type == 'js':
565 code = self._download_webpage(
566 player_url, video_id,
6d1a55a5 567 note=download_note,
69ea8ca4 568 errnote='Download of %s failed' % player_url)
83799698 569 res = self._parse_sig_js(code)
c4417ddb 570 elif player_type == 'swf':
e0df6211
PH
571 urlh = self._request_webpage(
572 player_url, video_id,
6d1a55a5 573 note=download_note,
69ea8ca4 574 errnote='Download of %s failed' % player_url)
e0df6211 575 code = urlh.read()
83799698 576 res = self._parse_sig_swf(code)
e0df6211
PH
577 else:
578 assert False, 'Invalid player type %r' % player_type
579
785521bf
PH
580 test_string = ''.join(map(compat_chr, range(len(example_sig))))
581 cache_res = res(test_string)
582 cache_spec = [ord(c) for c in cache_res]
83799698 583
69ea8ca4 584 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
585 return res
586
def _print_sig_code(self, func, example_sig):
    """Print Python code that reproduces *func* for signatures shaped like *example_sig*."""
    def gen_sig_code(idxs):
        def _genslice(first, last, stride):
            head = str(first) if first != 0 else ''
            stop = last + stride
            tail = ':' if stop < 0 else ':%d' % stop
            jump = (':%d' % stride) if stride != 1 else ''
            return 's[%s%s%s]' % (head, tail, jump)

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if step is not None:
                # Inside a run: emit it as a slice once the stride breaks.
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in (-1, 1):
                step, start = delta, prev
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Feed the function a string whose characters encode their own position.
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 625
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in the JS player code."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return js_function([s])
    return decipher
634
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping SignatureDecipher.decipher from the SWF player."""
    interpreter = SWFInterpreter(file_contents)
    TARGET_CLASSNAME = 'SignatureDecipher'
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return swf_function([s])
    return decipher
641
83799698 642 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 643 """Turn the encrypted s field into a working signature"""
6b37f0be 644
c8bf86d5 645 if player_url is None:
69ea8ca4 646 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 647
69ea8ca4 648 if player_url.startswith('//'):
78caa52a 649 player_url = 'https:' + player_url
c8bf86d5 650 try:
62af3a0e 651 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
652 if player_id not in self._player_cache:
653 func = self._extract_signature_function(
60064c53 654 video_id, player_url, s
c8bf86d5
PH
655 )
656 self._player_cache[player_id] = func
657 func = self._player_cache[player_id]
658 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 659 self._print_sig_code(func, s)
c8bf86d5
PH
660 return func(s)
661 except Exception as e:
662 tb = traceback.format_exc()
663 raise ExtractorError(
78caa52a 664 'Signature extraction failed: ' + tb, cause=e)
e0df6211 665
360e1ca5 666 def _get_subtitles(self, video_id, webpage):
de7f3446 667 try:
60e47a26 668 subs_doc = self._download_xml(
38c2e5b8 669 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
670 video_id, note=False)
671 except ExtractorError as err:
69ea8ca4 672 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 673 return {}
de7f3446
JMF
674
675 sub_lang_list = {}
60e47a26
JMF
676 for track in subs_doc.findall('track'):
677 lang = track.attrib['lang_code']
7e660ac1
LD
678 if lang in sub_lang_list:
679 continue
360e1ca5
JMF
680 sub_formats = []
681 for ext in ['sbv', 'vtt', 'srt']:
682 params = compat_urllib_parse.urlencode({
683 'lang': lang,
684 'v': video_id,
685 'fmt': ext,
686 'name': track.attrib['name'].encode('utf-8'),
687 })
688 sub_formats.append({
689 'url': 'https://www.youtube.com/api/timedtext?' + params,
690 'ext': ext,
691 })
692 sub_lang_list[lang] = sub_formats
de7f3446 693 if not sub_lang_list:
69ea8ca4 694 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
695 return {}
696 return sub_lang_list
697
360e1ca5 698 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
699 """We need the webpage for getting the captions url, pass it as an
700 argument to speed up the process."""
69ea8ca4 701 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 702 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 703 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
704 if mobj is None:
705 self._downloader.report_warning(err_msg)
706 return {}
707 player_config = json.loads(mobj.group(1))
708 try:
0792d563
PH
709 args = player_config['args']
710 caption_url = args['ttsurl']
711 timestamp = args['timestamp']
055e6f36
JMF
712 # We get the available subtitles
713 list_params = compat_urllib_parse.urlencode({
714 'type': 'list',
715 'tlangs': 1,
716 'asrs': 1,
de7f3446 717 })
055e6f36 718 list_url = caption_url + '&' + list_params
e26f8712 719 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 720 original_lang_node = caption_list.find('track')
7d900ef1 721 if original_lang_node is None:
69ea8ca4 722 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
723 return {}
724 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 725 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
726
727 sub_lang_list = {}
728 for lang_node in caption_list.findall('target'):
729 sub_lang = lang_node.attrib['lang_code']
360e1ca5
JMF
730 sub_formats = []
731 for ext in ['sbv', 'vtt', 'srt']:
732 params = compat_urllib_parse.urlencode({
733 'lang': original_lang,
734 'tlang': sub_lang,
735 'fmt': ext,
736 'ts': timestamp,
737 'kind': caption_kind,
738 })
739 sub_formats.append({
740 'url': caption_url + '&' + params,
741 'ext': ext,
742 })
743 sub_lang_list[sub_lang] = sub_formats
055e6f36 744 return sub_lang_list
de7f3446
JMF
745 # An extractor error can be raise by the download process if there are
746 # no automatic captions but there are subtitles
747 except (KeyError, ExtractorError):
748 self._downloader.report_warning(err_msg)
749 return {}
750
97665381
PH
751 @classmethod
752 def extract_id(cls, url):
753 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
c5e8d7af 754 if mobj is None:
69ea8ca4 755 raise ExtractorError('Invalid URL: %s' % url)
c5e8d7af
PH
756 video_id = mobj.group(2)
757 return video_id
758
1d043b93
JMF
759 def _extract_from_m3u8(self, manifest_url, video_id):
760 url_map = {}
5f6a1245 761
1d043b93
JMF
762 def _get_urls(_manifest):
763 lines = _manifest.split('\n')
764 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 765 lines)
1d043b93 766 return urls
78caa52a 767 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
768 formats_urls = _get_urls(manifest)
769 for format_url in formats_urls:
890f62e8 770 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
771 url_map[itag] = format_url
772 return url_map
773
1fb07d10
JG
774 def _extract_annotations(self, video_id):
775 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 776 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 777
da276600
PH
778 def _parse_dash_manifest(
779 self, video_id, dash_manifest_url, player_url, age_gate):
774e208f
PH
780 def decrypt_sig(mobj):
781 s = mobj.group(1)
782 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
783 return '/signature/%s' % dec_s
784 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
785 dash_doc = self._download_xml(
786 dash_manifest_url, video_id,
787 note='Downloading DASH manifest',
788 errnote='Could not download DASH manifest')
789
790 formats = []
791 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
792 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
793 if url_el is None:
794 continue
795 format_id = r.attrib['id']
796 video_url = url_el.text
797 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
798 f = {
799 'format_id': format_id,
800 'url': video_url,
801 'width': int_or_none(r.attrib.get('width')),
e65566a9 802 'height': int_or_none(r.attrib.get('height')),
774e208f
PH
803 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
804 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
805 'filesize': filesize,
806 'fps': int_or_none(r.attrib.get('frameRate')),
807 }
808 try:
809 existing_format = next(
810 fo for fo in formats
811 if fo['format_id'] == format_id)
812 except StopIteration:
ba617964
JMF
813 full_info = self._formats.get(format_id, {}).copy()
814 full_info.update(f)
815 formats.append(full_info)
774e208f
PH
816 else:
817 existing_format.update(f)
818 return formats
819
c5e8d7af 820 def _real_extract(self, url):
7e8c0af0 821 proto = (
78caa52a
PH
822 'http' if self._downloader.params.get('prefer_insecure', False)
823 else 'https')
7e8c0af0 824
c5e8d7af
PH
825 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
826 mobj = re.search(self._NEXT_URL_RE, url)
827 if mobj:
7e8c0af0 828 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 829 video_id = self.extract_id(url)
c5e8d7af
PH
830
831 # Get video webpage
aa79ac0c 832 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 833 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
834
835 # Attempt to extract SWF player URL
e0df6211 836 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
837 if mobj is not None:
838 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
839 else:
840 player_url = None
841
842 # Get video info
6449cd80 843 embed_webpage = None
c108eb73 844 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
845 age_gate = True
846 # We simulate the access to the video from www.youtube.com/v/{video_id}
847 # this can be viewed without login into Youtube
beb95e77
CL
848 url = proto + '://www.youtube.com/embed/%s' % video_id
849 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
2c57c7fa
JMF
850 data = compat_urllib_parse.urlencode({
851 'video_id': video_id,
852 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 853 'sts': self._search_regex(
beb95e77 854 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
2c57c7fa 855 })
7e8c0af0 856 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
857 video_info_webpage = self._download_webpage(
858 video_info_url, video_id,
20436c30 859 note='Refetching age-gated info webpage',
94bd3613 860 errnote='unable to download video info webpage')
c5e8d7af 861 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
862 else:
863 age_gate = False
4e62ebe2
JMF
864 try:
865 # Try looking directly into the video webpage
866 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
867 if not mobj:
868 raise ValueError('Could not find ytplayer.config') # caught below
869 json_code = uppercase_escape(mobj.group(1))
870 ytplayer_config = json.loads(json_code)
871 args = ytplayer_config['args']
872 # Convert to the same format returned by compat_parse_qs
873 video_info = dict((k, [v]) for k, v in args.items())
3a9fadd6
YCH
874 if ('url_encoded_fmt_stream_map' not in args or
875 args['url_encoded_fmt_stream_map'] == ''):
4e62ebe2
JMF
876 raise ValueError('No stream_map present') # caught below
877 except ValueError:
878 # We fallback to the get_video_info pages (used by the embed page)
879 self.report_video_info_webpage_download(video_id)
880 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
810fb84d
PH
881 video_info_url = (
882 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
883 % (proto, video_id, el_type))
884 video_info_webpage = self._download_webpage(
885 video_info_url,
4e62ebe2
JMF
886 video_id, note=False,
887 errnote='unable to download video info webpage')
888 video_info = compat_parse_qs(video_info_webpage)
889 if 'token' in video_info:
890 break
c5e8d7af
PH
891 if 'token' not in video_info:
892 if 'reason' in video_info:
d11271dd 893 raise ExtractorError(
78caa52a 894 'YouTube said: %s' % video_info['reason'][0],
d11271dd 895 expected=True, video_id=video_id)
c5e8d7af 896 else:
d11271dd 897 raise ExtractorError(
78caa52a 898 '"token" parameter not in video info for unknown reason',
d11271dd 899 video_id=video_id)
c5e8d7af 900
1d699755
PH
901 if 'view_count' in video_info:
902 view_count = int(video_info['view_count'][0])
903 else:
904 view_count = None
905
c5e8d7af
PH
906 # Check for "rental" videos
907 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 908 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
909
910 # Start extracting information
911 self.report_information_extraction(video_id)
912
913 # uploader
914 if 'author' not in video_info:
69ea8ca4 915 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
916 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
917
918 # uploader_id
919 video_uploader_id = None
920 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
921 if mobj is not None:
922 video_uploader_id = mobj.group(1)
923 else:
69ea8ca4 924 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
925
926 # title
a8c6b241 927 if 'title' in video_info:
aa92f063 928 video_title = video_info['title'][0]
a8c6b241 929 else:
69ea8ca4 930 self._downloader.report_warning('Unable to extract video title')
78caa52a 931 video_title = '_'
c5e8d7af
PH
932
933 # thumbnail image
7763b04e
JMF
934 # We try first to get a high quality image:
935 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
936 video_webpage, re.DOTALL)
937 if m_thumb is not None:
938 video_thumbnail = m_thumb.group(1)
939 elif 'thumbnail_url' not in video_info:
69ea8ca4 940 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 941 video_thumbnail = None
c5e8d7af
PH
942 else: # don't panic if we can't find it
943 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
944
945 # upload date
946 upload_date = None
ad3bc6ac 947 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
948 if mobj is None:
949 mobj = re.search(
263bd4ec 950 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 951 video_webpage)
c5e8d7af
PH
952 if mobj is not None:
953 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
954 upload_date = unified_strdate(upload_date)
955
55f7bd2d
PH
956 m_cat_container = self._search_regex(
957 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
624dcebf 958 video_webpage, 'categories', default=None)
ec8deefc 959 if m_cat_container:
ad3bc6ac 960 category = self._html_search_regex(
01ed5c9b 961 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
962 default=None)
963 video_categories = None if category is None else [category]
964 else:
965 video_categories = None
ec8deefc 966
c5e8d7af
PH
967 # description
968 video_description = get_element_by_id("eow-description", video_webpage)
969 if video_description:
27dcce19
PH
970 video_description = re.sub(r'''(?x)
971 <a\s+
972 (?:[a-zA-Z-]+="[^"]+"\s+)*?
973 title="([^"]+)"\s+
974 (?:[a-zA-Z-]+="[^"]+"\s+)*?
975 class="yt-uix-redirect-link"\s*>
976 [^<]+
977 </a>
978 ''', r'\1', video_description)
c5e8d7af
PH
979 video_description = clean_html(video_description)
980 else:
981 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
982 if fd_mobj:
983 video_description = unescapeHTML(fd_mobj.group(1))
984 else:
78caa52a 985 video_description = ''
c5e8d7af 986
f30a38be 987 def _extract_count(count_name):
46374a56 988 count = self._search_regex(
f30a38be
JMF
989 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
990 video_webpage, count_name, default=None)
336c3a69
JMF
991 if count is not None:
992 return int(count.replace(',', ''))
993 return None
69ea8ca4
PH
994 like_count = _extract_count('like')
995 dislike_count = _extract_count('dislike')
336c3a69 996
c5e8d7af 997 # subtitles
d82134c3 998 video_subtitles = self.extract_subtitles(video_id, video_webpage)
360e1ca5 999 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
c5e8d7af
PH
1000
1001 if 'length_seconds' not in video_info:
69ea8ca4 1002 self._downloader.report_warning('unable to extract video duration')
b466b702 1003 video_duration = None
c5e8d7af 1004 else:
b466b702 1005 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1006
1fb07d10
JG
1007 # annotations
1008 video_annotations = None
1009 if self._downloader.params.get('writeannotations', False):
5f6a1245 1010 video_annotations = self._extract_annotations(video_id)
1fb07d10 1011
dd27fd17
PH
1012 def _map_to_format_list(urlmap):
1013 formats = []
1014 for itag, video_real_url in urlmap.items():
1015 dct = {
1016 'format_id': itag,
1017 'url': video_real_url,
1018 'player_url': player_url,
1019 }
0b65e5d4
PH
1020 if itag in self._formats:
1021 dct.update(self._formats[itag])
dd27fd17
PH
1022 formats.append(dct)
1023 return formats
1024
c5e8d7af
PH
1025 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1026 self.report_rtmp_download()
dd27fd17
PH
1027 formats = [{
1028 'format_id': '_rtmp',
1029 'protocol': 'rtmp',
1030 'url': video_info['conn'][0],
1031 'player_url': player_url,
1032 }]
24270b03 1033 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
5f6a1245 1034 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1035 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1036 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1037 url_map = {}
00fe14fc 1038 for url_data_str in encoded_url_map.split(','):
c5e8d7af 1039 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
1040 if 'itag' not in url_data or 'url' not in url_data:
1041 continue
1042 format_id = url_data['itag'][0]
1043 url = url_data['url'][0]
1044
1045 if 'sig' in url_data:
1046 url += '&signature=' + url_data['sig'][0]
1047 elif 's' in url_data:
1048 encrypted_sig = url_data['s'][0]
6449cd80 1049 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
201e9eaa 1050
beb95e77 1051 jsplayer_url_json = self._search_regex(
6449cd80
PH
1052 ASSETS_RE,
1053 embed_webpage if age_gate else video_webpage,
1054 'JS player URL (1)', default=None)
1055 if not jsplayer_url_json and not age_gate:
1056 # We need the embed website after all
1057 if embed_webpage is None:
1058 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1059 embed_webpage = self._download_webpage(
1060 embed_url, video_id, 'Downloading embed webpage')
1061 jsplayer_url_json = self._search_regex(
1062 ASSETS_RE, embed_webpage, 'JS player URL')
1063
beb95e77 1064 player_url = json.loads(jsplayer_url_json)
201e9eaa
PH
1065 if player_url is None:
1066 player_url_json = self._search_regex(
1067 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 1068 video_webpage, 'age gate player URL')
201e9eaa
PH
1069 player_url = json.loads(player_url_json)
1070
1071 if self._downloader.params.get('verbose'):
cf010131 1072 if player_url is None:
201e9eaa
PH
1073 player_version = 'unknown'
1074 player_desc = 'unknown'
1075 else:
1076 if player_url.endswith('swf'):
1077 player_version = self._search_regex(
1078 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 1079 'flash player', fatal=False)
201e9eaa 1080 player_desc = 'flash player %s' % player_version
cf010131 1081 else:
201e9eaa
PH
1082 player_version = self._search_regex(
1083 r'html5player-([^/]+?)(?:/html5player)?\.js',
1084 player_url,
1085 'html5 player', fatal=False)
78caa52a 1086 player_desc = 'html5 player %s' % player_version
201e9eaa 1087
60064c53 1088 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 1089 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 1090 (format_id, parts_sizes, player_desc))
201e9eaa
PH
1091
1092 signature = self._decrypt_signature(
1093 encrypted_sig, video_id, player_url, age_gate)
1094 url += '&signature=' + signature
1095 if 'ratebypass' not in url:
1096 url += '&ratebypass=yes'
1097 url_map[format_id] = url
dd27fd17 1098 formats = _map_to_format_list(url_map)
1d043b93
JMF
1099 elif video_info.get('hlsvp'):
1100 manifest_url = video_info['hlsvp'][0]
1101 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 1102 formats = _map_to_format_list(url_map)
c5e8d7af 1103 else:
69ea8ca4 1104 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1105
dd27fd17 1106 # Look for the DASH manifest
203fb43f 1107 if self._downloader.params.get('youtube_include_dash_manifest', True):
774e208f 1108 dash_mpd = video_info.get('dashmpd')
75111274 1109 if dash_mpd:
774e208f
PH
1110 dash_manifest_url = dash_mpd[0]
1111 try:
1112 dash_formats = self._parse_dash_manifest(
da276600 1113 video_id, dash_manifest_url, player_url, age_gate)
774e208f
PH
1114 except (ExtractorError, KeyError) as e:
1115 self.report_warning(
1116 'Skipping DASH manifest: %r' % e, video_id)
1117 else:
e65566a9
PH
1118 # Hide the formats we found through non-DASH
1119 dash_keys = set(df['format_id'] for df in dash_formats)
1120 for f in formats:
1121 if f['format_id'] in dash_keys:
1122 f['format_id'] = 'nondash-%s' % f['format_id']
ee61f6f3 1123 f['preference'] = f.get('preference', 0) - 10000
774e208f 1124 formats.extend(dash_formats)
d80044c2 1125
6271f1ca
PH
1126 # Check for malformed aspect ratio
1127 stretched_m = re.search(
1128 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1129 video_webpage)
1130 if stretched_m:
1131 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1132 for f in formats:
1133 if f.get('vcodec') != 'none':
1134 f['stretched_ratio'] = ratio
1135
4bcc7bd1 1136 self._sort_formats(formats)
4ea3be0a 1137
1138 return {
8bcc8756
JW
1139 'id': video_id,
1140 'uploader': video_uploader,
1141 'uploader_id': video_uploader_id,
1142 'upload_date': upload_date,
1143 'title': video_title,
1144 'thumbnail': video_thumbnail,
1145 'description': video_description,
1146 'categories': video_categories,
1147 'subtitles': video_subtitles,
360e1ca5 1148 'automatic_captions': automatic_captions,
8bcc8756
JW
1149 'duration': video_duration,
1150 'age_limit': 18 if age_gate else 0,
1151 'annotations': video_annotations,
7e8c0af0 1152 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 1153 'view_count': view_count,
4ea3be0a 1154 'like_count': like_count,
1155 'dislike_count': dislike_count,
2d30521a 1156 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
8bcc8756 1157 'formats': formats,
4ea3be0a 1158 }
c5e8d7af 1159
5f6a1245 1160
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, given as a URL or as a bare playlist id."""
    IE_DESC = 'YouTube.com playlists'
    # Group 1: playlist id taken from a youtube.com URL.
    # Group 2: bare playlist id (no URL around it).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Key was misspelled 'playlist_mincout', unlike the
        # 'playlist_mincount' key used by the other entries.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix.

        The entries and the title are scraped from the watch page of the
        video whose id is the last 11 characters of *playlist_id*.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in order; the first non-empty wins
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Return the playlist result for a regular playlist.

        Follows the "load more" links until none is left or an empty page is
        returned.  Raises ExtractorError (expected) when the page reports
        that the playlist does not exist or is private.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # For the first round both the video links and the "load more"
        # widget are looked up in the playlist page itself.
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch *url* to the mix or regular playlist extraction.

        When the URL also carries a ``v`` parameter and the ``noplaylist``
        option is set, only that video is returned.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1349
c5e8d7af
PH
1350
class YoutubeChannelIE(InfoExtractor):
    """Extractor listing the videos found on a channel's /videos page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first occurrence only, in page order."""
        unique_ids = []
        for found_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if found_id in unique_ids:
                continue
            unique_ids.append(found_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with the videos of the channel in *url*.

        Channels marked as autogenerated are read from the single page;
        other channels are paged lazily through their "load more" links.
        """
        channel_id = self._match_id(url)

        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            return self.playlist_result(
                [self.url_result(vid, 'Youtube', video_id=vid) for vid in video_ids],
                channel_id)

        def _entries():
            # The first round reads both the links and the "load more"
            # widget from the channel page itself.
            listing_html = channel_page
            widget_html = channel_page
            next_page = 2
            while True:
                for vid in self.extract_videos_from_page(listing_html):
                    yield self.url_result(
                        vid, 'Youtube', video_id=vid)

                more_link = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    widget_html)
                if not more_link:
                    return

                more = self._download_json(
                    'https://youtube.com/%s' % more_link.group('more'), channel_id,
                    'Downloading page #%s' % next_page,
                    transform_source=uppercase_escape)
                listing_html = more['content_html']
                widget_html = more['load_more_widget_html']
                next_page += 1

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1415
1416
class YoutubeUserIE(InfoExtractor):
    """Extractor listing a user's uploads through the GData feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other ``*IE`` class of this module does.

        The regex of this extractor is too permissive and would otherwise
        also match URLs that belong to the other YouTube extractors.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result titled with the user name."""
        username = self._match_id(url)
        page_size = self._GDATA_PAGE_SIZE

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The feed's start-index is 1-based
            first = pagenum * page_size + 1
            note = 'Downloading video ids from %d to %d' % (first, first + page_size)
            raw = self._download_webpage(
                self._GDATA_URL % (username, page_size, first),
                username, note)

            try:
                feed = json.loads(raw)['feed']
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in feed:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            for item in feed['entry']:
                title = item['title']['$t']
                # The id is the last path component of the entry id
                video_id = item['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        return self.playlist_result(
            OnDemandPagedList(download_page, page_size),
            playlist_title=username)
1485
b05654f0
PH
1486
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (``ytsearch`` key)."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        collected = []
        # 'fetched' counts the results already requested; 'wanted' shrinks
        # to the total reported by the API once a page has been read.
        fetched = 0
        wanted = n
        page_number = 1
        while fetched < wanted:
            raw = self._download_webpage(
                self._API_URL % (quoted_query, fetched + 1),
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_number,
                errnote='Unable to download API page')
            api_response = json.loads(raw)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected.extend([item['id'] for item in api_response['items']])

            wanted = min(n, api_response['totalItems'])
            fetched += PAGE_SIZE
            page_number += 1

        # The last page may have delivered more ids than requested
        collected = collected[:n] if len(collected) > n else collected
        return self.playlist_result(
            [self.url_result(vid, 'Youtube', video_id=vid) for vid in collected],
            query)
75dff0ee 1528
c9ae7b95 1529
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same placeholders as the parent URL: url-quoted query, start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1535
c9ae7b95
PH
1536
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page at ``url``.

        Returns a playlist dict titled with the decoded search query; each
        entry is a ``url`` result pointing at one search hit.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only the first <ol class="item-section"> block is inspected.
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for lockup_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional (fatal=False); the link is mandatory.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], lockup_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', lockup_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1578
1579
class YoutubeShowIE(InfoExtractor):
    """Expand a youtube.com/show/<id> page into its per-season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # Every season of the show is exposed as a separate playlist link.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is best-effort: a missing og:title is not fatal.
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1614
1615
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common logic for extractors that read a feed through
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed with one %s slot left for paging."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect every video of the feed, following "load more" paging."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            # When no separate widget is sent, look for the paging link in
            # the feed markup itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            # De-duplicate the ids within this page, keeping first-seen order.
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1663
5f6a1245 1664
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_NAME = 'youtube:recommended'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Matches the feed page URL as well as the :ytrec / :ytrecommended keywords.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1671
5f6a1245 1672
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Playlist extractor pinned to the 'WL' (watch later) list."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # Replace the tests defined on YoutubePlaylistIE with none.
    _TESTS = []

    def _real_extract(self, url):
        """Extract the playlist 'WL' regardless of which URL form matched."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
1682
5f6a1245 1683
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_NAME = 'youtube:history'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string so that ``\.`` is a regex escape and not an (invalid) string
    # escape sequence; the resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history is fetched with action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1691
5f6a1245 1692
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=<id>" occurrence on the page is taken as the id.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1703
1704
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos shown on the subscriptions feed page."""
    IE_NAME = 'youtube:subscriptions'
    # The keyword needs its leading colon to match _VALID_URL below.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Replace the tests defined on YoutubePlaylistIE with none.
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist of all videos in the subscriptions feed.

        The feed page is always fetched from its canonical URL; ``url`` is
        not used beyond having matched _VALID_URL.
        """
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            # Ids are de-duplicated per page, not across pages.
            ids.extend(orderedSet(matches))

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                # No "load more" link: this was the last page.
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1741
1742
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that carry no video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1786
1787
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose v= value is 1 to 10 characters long and reject them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)