]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Credit @gebn for moviefap
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
4bb4a188 31 orderedSet,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
81c2f20b 34 uppercase_escape,
af214c3a 35 ISO3166Utils,
c5e8d7af
PH
36)
37
5f6a1245 38
de7f3446 39class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
83317f69 42 _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
b2e8bc1b
JMF
43 _NETRC_MACHINE = 'youtube'
44 # If True it will raise an error if no login info is provided
45 _LOGIN_REQUIRED = False
46
b2e8bc1b 47 def _set_language(self):
810fb84d
PH
48 self._set_cookie(
49 '.youtube.com', 'PREF', 'f1=50000000&hl=en',
42939b61 50 # YouTube sets the expire time to about two months
810fb84d 51 expire_time=time.time() + 2 * 30 * 24 * 3600)
b2e8bc1b 52
25f14e9f
S
53 def _ids_to_results(self, ids):
54 return [
55 self.url_result(vid_id, 'Youtube', video_id=vid_id)
56 for vid_id in ids]
57
b2e8bc1b 58 def _login(self):
83317f69 59 """
60 Attempt to log in to YouTube.
61 True is returned if successful or skipped.
62 False is returned if login failed.
63
64 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
65 """
b2e8bc1b
JMF
66 (username, password) = self._get_login_info()
67 # No authentication to be performed
68 if username is None:
69 if self._LOGIN_REQUIRED:
69ea8ca4 70 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
83317f69 71 return True
b2e8bc1b 72
7cc3570e
PH
73 login_page = self._download_webpage(
74 self._LOGIN_URL, None,
69ea8ca4
PH
75 note='Downloading login page',
76 errnote='unable to fetch login page', fatal=False)
7cc3570e
PH
77 if login_page is False:
78 return
b2e8bc1b 79
795f28f8 80 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
78caa52a 81 login_page, 'Login GALX parameter')
c5e8d7af 82
b2e8bc1b
JMF
83 # Log in
84 login_form_strs = {
8bcc8756
JW
85 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 'Email': username,
87 'GALX': galx,
88 'Passwd': password,
89
90 'PersistentCookie': 'yes',
91 '_utf8': '霱',
92 'bgresponse': 'js_disabled',
93 'checkConnection': '',
94 'checkedDomains': 'youtube',
95 'dnConn': '',
96 'pstMsg': '0',
97 'rmShown': '1',
98 'secTok': '',
99 'signIn': 'Sign in',
100 'timeStmp': '',
101 'service': 'youtube',
102 'uilel': '3',
103 'hl': 'en_US',
b2e8bc1b 104 }
83317f69 105
b2e8bc1b
JMF
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
107 # chokes on unicode
5f6a1245 108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
b2e8bc1b 109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
7cc3570e
PH
110
111 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
112 login_results = self._download_webpage(
113 req, None,
69ea8ca4 114 note='Logging in', errnote='unable to log in', fatal=False)
7cc3570e
PH
115 if login_results is False:
116 return False
83317f69 117
118 if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
69ea8ca4 119 raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
83317f69 120
121 # Two-Factor
122 # TODO add SMS and phone call support - these require making a request and then prompting the user
123
124 if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
125 tfa_code = self._get_tfa_info()
126
127 if tfa_code is None:
69ea8ca4
PH
128 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
129 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
83317f69 130 return False
131
132 # Unlike the first login form, secTok and timeStmp are both required for the TFA form
133
134 match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
135 if match is None:
69ea8ca4 136 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
83317f69 137 secTok = match.group(1)
138 match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
139 if match is None:
69ea8ca4 140 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
83317f69 141 timeStmp = match.group(1)
142
143 tfa_form_strs = {
78caa52a
PH
144 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
145 'smsToken': '',
146 'smsUserPin': tfa_code,
147 'smsVerifyPin': 'Verify',
148
149 'PersistentCookie': 'yes',
150 'checkConnection': '',
151 'checkedDomains': 'youtube',
152 'pstMsg': '1',
153 'secTok': secTok,
154 'timeStmp': timeStmp,
155 'service': 'youtube',
156 'hl': 'en_US',
83317f69 157 }
5f6a1245 158 tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
83317f69 159 tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
160
161 tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
162 tfa_results = self._download_webpage(
163 tfa_req, None,
69ea8ca4 164 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
83317f69 165
166 if tfa_results is False:
167 return False
168
169 if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
69ea8ca4 170 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
83317f69 171 return False
172 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
69ea8ca4 173 self._downloader.report_warning('unable to log in - did the page structure change?')
83317f69 174 return False
175 if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
69ea8ca4 176 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
83317f69 177 return False
178
7cc3570e 179 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
69ea8ca4 180 self._downloader.report_warning('unable to log in: bad username or password')
b2e8bc1b
JMF
181 return False
182 return True
183
b2e8bc1b
JMF
184 def _real_initialize(self):
185 if self._downloader is None:
186 return
42939b61 187 self._set_language()
b2e8bc1b
JMF
188 if not self._login():
189 return
c5e8d7af 190
8377574c 191
360e1ca5 192class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 193 IE_DESC = 'YouTube.com'
cb7dfeea 194 _VALID_URL = r"""(?x)^
c5e8d7af 195 (
edb53e2d 196 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 197 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 198 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 199 (?:www\.)?pwnyoutube\.com/|
f7000f3a 200 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
201 tube\.majestyc\.net/|
202 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
203 (?:.*?\#/)? # handle anchor (#/) redirect urls
204 (?: # the various things that can precede the ID:
ac7553d0 205 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 206 |(?: # or the v= param in all its forms
f7000f3a 207 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
208 (?:\?|\#!?) # the params delimiter ? or # or #!
209 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
210 v=
211 )
f4b05232
JMF
212 ))
213 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 214 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 215 )
c5e8d7af 216 )? # all until now is optional -> you can pass the naked ID
8963d9c2 217 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 218 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
219 (?(1).+)? # if we found the ID, everything can follow
220 $"""
c5e8d7af 221 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
222 _formats = {
223 '5': {'ext': 'flv', 'width': 400, 'height': 240},
224 '6': {'ext': 'flv', 'width': 450, 'height': 270},
225 '13': {'ext': '3gp'},
226 '17': {'ext': '3gp', 'width': 176, 'height': 144},
227 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
228 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
229 '34': {'ext': 'flv', 'width': 640, 'height': 360},
230 '35': {'ext': 'flv', 'width': 854, 'height': 480},
231 '36': {'ext': '3gp', 'width': 320, 'height': 240},
232 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
233 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
234 '43': {'ext': 'webm', 'width': 640, 'height': 360},
235 '44': {'ext': 'webm', 'width': 854, 'height': 480},
236 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
237 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
238 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
239 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 240
1d043b93 241
86fe61c8 242 # 3d videos
43b81eb9
PH
243 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
244 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
245 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
246 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
247 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
248 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
249 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 250
96fb5605 251 # Apple HTTP Live Streaming
43b81eb9
PH
252 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
253 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
254 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
255 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
256 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
257 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
258 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
259
260 # DASH mp4 video
43b81eb9
PH
261 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
262 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
263 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
264 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
265 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 266 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
267 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
268 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
269 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
270 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
271 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 272
f6f1fc92 273 # Dash mp4 audio
62cd676c
PH
274 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
275 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
276 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
277
278 # Dash webm
e75cafe9
A
279 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
280 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
281 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
282 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
283 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 285 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
286 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
287 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
288 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
289 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 293 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 294 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
295 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
296 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 297 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 298 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 299 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
300
301 # Dash webm audio
55db73ef 302 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 303 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 304
0857baad
PH
305 # Dash webm audio with opus inside
306 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
307 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
308 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
309
ce6b9a2d
PH
310 # RTMP (unnamed)
311 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 312 }
836a086c 313
78caa52a 314 IE_NAME = 'youtube'
2eb88d95
PH
315 _TESTS = [
316 {
4bc3a23e
PH
317 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
318 'info_dict': {
319 'id': 'BaW_jenozKc',
320 'ext': 'mp4',
321 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
322 'uploader': 'Philipp Hagemeister',
323 'uploader_id': 'phihag',
324 'upload_date': '20121002',
325 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
326 'categories': ['Science & Technology'],
3e7c1224
PH
327 'like_count': int,
328 'dislike_count': int,
2eb88d95 329 }
0e853ca4 330 },
0e853ca4 331 {
4bc3a23e
PH
332 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
333 'note': 'Test generic use_cipher_signature video (#897)',
334 'info_dict': {
335 'id': 'UxxajLWwzqY',
336 'ext': 'mp4',
337 'upload_date': '20120506',
338 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
339 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
340 'uploader': 'Icona Pop',
341 'uploader_id': 'IconaPop',
2eb88d95 342 }
c108eb73
JMF
343 },
344 {
4bc3a23e
PH
345 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
346 'note': 'Test VEVO video with age protection (#956)',
347 'info_dict': {
348 'id': '07FYdnEawAQ',
349 'ext': 'mp4',
350 'upload_date': '20130703',
351 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
352 'description': 'md5:64249768eec3bc4276236606ea996373',
353 'uploader': 'justintimberlakeVEVO',
354 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
355 }
356 },
fccd3771 357 {
4bc3a23e
PH
358 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
359 'note': 'Embed-only video (#1746)',
360 'info_dict': {
361 'id': 'yZIXLfi8CZQ',
362 'ext': 'mp4',
363 'upload_date': '20120608',
364 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
365 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
366 'uploader': 'SET India',
367 'uploader_id': 'setindia'
fccd3771
PH
368 }
369 },
dd27fd17 370 {
4bc3a23e
PH
371 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
372 'note': '256k DASH audio (format 141) via DASH manifest',
373 'info_dict': {
374 'id': 'a9LDPn-MO4I',
375 'ext': 'm4a',
376 'upload_date': '20121002',
377 'uploader_id': '8KVIDEO',
378 'description': '',
379 'uploader': '8KVIDEO',
380 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 381 },
4bc3a23e
PH
382 'params': {
383 'youtube_include_dash_manifest': True,
384 'format': '141',
4919603f 385 },
dd27fd17 386 },
3489b7d2
JMF
387 # DASH manifest with encrypted signature
388 {
78caa52a
PH
389 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
390 'info_dict': {
391 'id': 'IB3lcPjvWLA',
392 'ext': 'm4a',
b766eb27
JMF
393 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
394 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
395 'uploader': 'AfrojackVEVO',
396 'uploader_id': 'AfrojackVEVO',
397 'upload_date': '20131011',
3489b7d2 398 },
4bc3a23e 399 'params': {
78caa52a
PH
400 'youtube_include_dash_manifest': True,
401 'format': '141',
3489b7d2
JMF
402 },
403 },
aaeb86f6
S
404 # JS player signature function name containing $
405 {
406 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
407 'info_dict': {
408 'id': 'nfWlot6h_JM',
409 'ext': 'm4a',
410 'title': 'Taylor Swift - Shake It Off',
411 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
412 'uploader': 'TaylorSwiftVEVO',
413 'uploader_id': 'TaylorSwiftVEVO',
414 'upload_date': '20140818',
415 },
416 'params': {
417 'youtube_include_dash_manifest': True,
418 'format': '141',
419 },
420 },
aa79ac0c
PH
421 # Controversy video
422 {
423 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
424 'info_dict': {
425 'id': 'T4XJQO3qol8',
426 'ext': 'mp4',
427 'upload_date': '20100909',
428 'uploader': 'The Amazing Atheist',
429 'uploader_id': 'TheAmazingAtheist',
430 'title': 'Burning Everyone\'s Koran',
431 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
432 }
c522adb1
JMF
433 },
434 # Normal age-gate video (No vevo, embed allowed)
435 {
436 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
437 'info_dict': {
438 'id': 'HtVdAasjOgU',
439 'ext': 'mp4',
440 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 441 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
442 'uploader': 'The Witcher',
443 'uploader_id': 'WitcherGame',
444 'upload_date': '20140605',
445 },
446 },
fccae2b9
S
447 # Age-gate video with encrypted signature
448 {
449 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
450 'info_dict': {
451 'id': '6kLq3WMV1nU',
452 'ext': 'mp4',
453 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
454 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
455 'uploader': 'LloydVEVO',
456 'uploader_id': 'LloydVEVO',
457 'upload_date': '20110629',
458 },
459 },
774e208f
PH
460 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
461 {
462 'url': '__2ABJjxzNo',
463 'info_dict': {
464 'id': '__2ABJjxzNo',
465 'ext': 'mp4',
466 'upload_date': '20100430',
467 'uploader_id': 'deadmau5',
468 'description': 'md5:12c56784b8032162bb936a5f76d55360',
469 'uploader': 'deadmau5',
470 'title': 'Deadmau5 - Some Chords (HD)',
471 },
472 'expected_warnings': [
473 'DASH manifest missing',
474 ]
e52a40ab
PH
475 },
476 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
477 {
478 'url': 'lqQg6PlCWgI',
479 'info_dict': {
480 'id': 'lqQg6PlCWgI',
481 'ext': 'mp4',
cbe2bd91
PH
482 'upload_date': '20120731',
483 'uploader_id': 'olympic',
484 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
485 'uploader': 'Olympics',
486 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
487 },
488 'params': {
489 'skip_download': 'requires avconv',
e52a40ab 490 }
cbe2bd91 491 },
6271f1ca
PH
492 # Non-square pixels
493 {
494 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
495 'info_dict': {
496 'id': '_b-2C3KPAM0',
497 'ext': 'mp4',
498 'stretched_ratio': 16 / 9.,
499 'upload_date': '20110310',
500 'uploader_id': 'AllenMeow',
501 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
502 'uploader': '孫艾倫',
503 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
504 },
06b491eb
S
505 },
506 # url_encoded_fmt_stream_map is empty string
507 {
508 'url': 'qEJwOuvDf7I',
509 'info_dict': {
510 'id': 'qEJwOuvDf7I',
511 'ext': 'mp4',
512 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
513 'description': '',
514 'upload_date': '20150404',
515 'uploader_id': 'spbelect',
516 'uploader': 'Наблюдатели Петербурга',
517 },
518 'params': {
519 'skip_download': 'requires avconv',
520 }
521 },
2eb88d95
PH
522 ]
523
e0df6211
PH
524 def __init__(self, *args, **kwargs):
525 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 526 self._player_cache = {}
e0df6211 527
c5e8d7af
PH
528 def report_video_info_webpage_download(self, video_id):
529 """Report attempt to download video info webpage."""
69ea8ca4 530 self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 531
c5e8d7af
PH
532 def report_information_extraction(self, video_id):
533 """Report attempt to extract video information."""
69ea8ca4 534 self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
535
536 def report_unavailable_format(self, video_id, format):
537 """Report extracted video URL."""
69ea8ca4 538 self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
539
540 def report_rtmp_download(self):
541 """Indicate the download will use the RTMP protocol."""
69ea8ca4 542 self.to_screen('RTMP download detected')
c5e8d7af 543
60064c53
PH
544 def _signature_cache_id(self, example_sig):
545 """ Return a string representation of a signature """
78caa52a 546 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
547
548 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 549 id_m = re.match(
60620368 550 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 551 player_url)
c081b35c
PH
552 if not id_m:
553 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
554 player_type = id_m.group('ext')
555 player_id = id_m.group('id')
556
c4417ddb 557 # Read from filesystem cache
60064c53
PH
558 func_id = '%s_%s_%s' % (
559 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 560 assert os.path.basename(func_id) == func_id
a0e07d31 561
69ea8ca4 562 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 563 if cache_spec is not None:
78caa52a 564 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 565
6d1a55a5
PH
566 download_note = (
567 'Downloading player %s' % player_url
568 if self._downloader.params.get('verbose') else
569 'Downloading %s player %s' % (player_type, player_id)
570 )
e0df6211
PH
571 if player_type == 'js':
572 code = self._download_webpage(
573 player_url, video_id,
6d1a55a5 574 note=download_note,
69ea8ca4 575 errnote='Download of %s failed' % player_url)
83799698 576 res = self._parse_sig_js(code)
c4417ddb 577 elif player_type == 'swf':
e0df6211
PH
578 urlh = self._request_webpage(
579 player_url, video_id,
6d1a55a5 580 note=download_note,
69ea8ca4 581 errnote='Download of %s failed' % player_url)
e0df6211 582 code = urlh.read()
83799698 583 res = self._parse_sig_swf(code)
e0df6211
PH
584 else:
585 assert False, 'Invalid player type %r' % player_type
586
785521bf
PH
587 test_string = ''.join(map(compat_chr, range(len(example_sig))))
588 cache_res = res(test_string)
589 cache_spec = [ord(c) for c in cache_res]
83799698 590
69ea8ca4 591 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
592 return res
593
60064c53 594 def _print_sig_code(self, func, example_sig):
edf3e38e
PH
595 def gen_sig_code(idxs):
596 def _genslice(start, end, step):
78caa52a 597 starts = '' if start == 0 else str(start)
8bcc8756 598 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
69ea8ca4 599 steps = '' if step == 1 else (':%d' % step)
78caa52a 600 return 's[%s%s%s]' % (starts, ends, steps)
edf3e38e
PH
601
602 step = None
7af808a5
PH
603 # Quelch pyflakes warnings - start will be set when step is set
604 start = '(Never used)'
edf3e38e
PH
605 for i, prev in zip(idxs[1:], idxs[:-1]):
606 if step is not None:
607 if i - prev == step:
608 continue
609 yield _genslice(start, prev, step)
610 step = None
611 continue
612 if i - prev in [-1, 1]:
613 step = i - prev
614 start = prev
615 continue
616 else:
78caa52a 617 yield 's[%d]' % prev
edf3e38e 618 if step is None:
78caa52a 619 yield 's[%d]' % i
edf3e38e
PH
620 else:
621 yield _genslice(start, i, step)
622
78caa52a 623 test_string = ''.join(map(compat_chr, range(len(example_sig))))
c705320f 624 cache_res = func(test_string)
edf3e38e 625 cache_spec = [ord(c) for c in cache_res]
78caa52a 626 expr_code = ' + '.join(gen_sig_code(cache_spec))
60064c53
PH
627 signature_id_tuple = '(%s)' % (
628 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
69ea8ca4 629 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
78caa52a 630 ' return %s\n') % (signature_id_tuple, expr_code)
69ea8ca4 631 self.to_screen('Extracted signature function:\n' + code)
edf3e38e 632
e0df6211
PH
633 def _parse_sig_js(self, jscode):
634 funcname = self._search_regex(
aaeb86f6 635 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
8bcc8756 636 'Initial JS player signature function name')
2b25cb5d
PH
637
638 jsi = JSInterpreter(jscode)
639 initial_function = jsi.extract_function(funcname)
e0df6211
PH
640 return lambda s: initial_function([s])
641
642 def _parse_sig_swf(self, file_contents):
54256267 643 swfi = SWFInterpreter(file_contents)
78caa52a 644 TARGET_CLASSNAME = 'SignatureDecipher'
54256267 645 searched_class = swfi.extract_class(TARGET_CLASSNAME)
78caa52a 646 initial_function = swfi.extract_function(searched_class, 'decipher')
e0df6211
PH
647 return lambda s: initial_function([s])
648
83799698 649 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 650 """Turn the encrypted s field into a working signature"""
6b37f0be 651
c8bf86d5 652 if player_url is None:
69ea8ca4 653 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 654
69ea8ca4 655 if player_url.startswith('//'):
78caa52a 656 player_url = 'https:' + player_url
c8bf86d5 657 try:
62af3a0e 658 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
659 if player_id not in self._player_cache:
660 func = self._extract_signature_function(
60064c53 661 video_id, player_url, s
c8bf86d5
PH
662 )
663 self._player_cache[player_id] = func
664 func = self._player_cache[player_id]
665 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 666 self._print_sig_code(func, s)
c8bf86d5
PH
667 return func(s)
668 except Exception as e:
669 tb = traceback.format_exc()
670 raise ExtractorError(
78caa52a 671 'Signature extraction failed: ' + tb, cause=e)
e0df6211 672
360e1ca5 673 def _get_subtitles(self, video_id, webpage):
de7f3446 674 try:
60e47a26 675 subs_doc = self._download_xml(
38c2e5b8 676 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
677 video_id, note=False)
678 except ExtractorError as err:
69ea8ca4 679 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 680 return {}
de7f3446
JMF
681
682 sub_lang_list = {}
60e47a26
JMF
683 for track in subs_doc.findall('track'):
684 lang = track.attrib['lang_code']
7e660ac1
LD
685 if lang in sub_lang_list:
686 continue
360e1ca5
JMF
687 sub_formats = []
688 for ext in ['sbv', 'vtt', 'srt']:
689 params = compat_urllib_parse.urlencode({
690 'lang': lang,
691 'v': video_id,
692 'fmt': ext,
693 'name': track.attrib['name'].encode('utf-8'),
694 })
695 sub_formats.append({
696 'url': 'https://www.youtube.com/api/timedtext?' + params,
697 'ext': ext,
698 })
699 sub_lang_list[lang] = sub_formats
de7f3446 700 if not sub_lang_list:
69ea8ca4 701 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
702 return {}
703 return sub_lang_list
704
360e1ca5 705 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
706 """We need the webpage for getting the captions url, pass it as an
707 argument to speed up the process."""
69ea8ca4 708 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 709 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 710 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
711 if mobj is None:
712 self._downloader.report_warning(err_msg)
713 return {}
714 player_config = json.loads(mobj.group(1))
715 try:
0792d563
PH
716 args = player_config['args']
717 caption_url = args['ttsurl']
718 timestamp = args['timestamp']
055e6f36
JMF
719 # We get the available subtitles
720 list_params = compat_urllib_parse.urlencode({
721 'type': 'list',
722 'tlangs': 1,
723 'asrs': 1,
de7f3446 724 })
055e6f36 725 list_url = caption_url + '&' + list_params
e26f8712 726 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 727 original_lang_node = caption_list.find('track')
7d900ef1 728 if original_lang_node is None:
69ea8ca4 729 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
730 return {}
731 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 732 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
733
734 sub_lang_list = {}
735 for lang_node in caption_list.findall('target'):
736 sub_lang = lang_node.attrib['lang_code']
360e1ca5
JMF
737 sub_formats = []
738 for ext in ['sbv', 'vtt', 'srt']:
739 params = compat_urllib_parse.urlencode({
740 'lang': original_lang,
741 'tlang': sub_lang,
742 'fmt': ext,
743 'ts': timestamp,
744 'kind': caption_kind,
745 })
746 sub_formats.append({
747 'url': caption_url + '&' + params,
748 'ext': ext,
749 })
750 sub_lang_list[sub_lang] = sub_formats
055e6f36 751 return sub_lang_list
de7f3446
JMF
752 # An extractor error can be raise by the download process if there are
753 # no automatic captions but there are subtitles
754 except (KeyError, ExtractorError):
755 self._downloader.report_warning(err_msg)
756 return {}
757
97665381
PH
758 @classmethod
759 def extract_id(cls, url):
760 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
c5e8d7af 761 if mobj is None:
69ea8ca4 762 raise ExtractorError('Invalid URL: %s' % url)
c5e8d7af
PH
763 video_id = mobj.group(2)
764 return video_id
765
1d043b93
JMF
766 def _extract_from_m3u8(self, manifest_url, video_id):
767 url_map = {}
5f6a1245 768
1d043b93
JMF
769 def _get_urls(_manifest):
770 lines = _manifest.split('\n')
771 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 772 lines)
1d043b93 773 return urls
78caa52a 774 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
775 formats_urls = _get_urls(manifest)
776 for format_url in formats_urls:
890f62e8 777 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
778 url_map[itag] = format_url
779 return url_map
780
1fb07d10
JG
781 def _extract_annotations(self, video_id):
782 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 783 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 784
da276600
PH
785 def _parse_dash_manifest(
786 self, video_id, dash_manifest_url, player_url, age_gate):
774e208f
PH
787 def decrypt_sig(mobj):
788 s = mobj.group(1)
789 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
790 return '/signature/%s' % dec_s
e1b9322b 791 dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
774e208f
PH
792 dash_doc = self._download_xml(
793 dash_manifest_url, video_id,
794 note='Downloading DASH manifest',
795 errnote='Could not download DASH manifest')
796
797 formats = []
de5c5456
YCH
798 for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
799 mime_type = a.attrib.get('mimeType')
800 for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
801 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
802 if url_el is None:
803 continue
804 if mime_type == 'text/vtt':
805 # TODO implement WebVTT downloading
806 pass
807 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
808 format_id = r.attrib['id']
809 video_url = url_el.text
810 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
811 f = {
812 'format_id': format_id,
813 'url': video_url,
814 'width': int_or_none(r.attrib.get('width')),
815 'height': int_or_none(r.attrib.get('height')),
816 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
817 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
818 'filesize': filesize,
819 'fps': int_or_none(r.attrib.get('frameRate')),
820 }
821 try:
822 existing_format = next(
823 fo for fo in formats
824 if fo['format_id'] == format_id)
825 except StopIteration:
826 full_info = self._formats.get(format_id, {}).copy()
827 full_info.update(f)
828 formats.append(full_info)
829 else:
830 existing_format.update(f)
831 else:
832 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
774e208f
PH
833 return formats
834
c5e8d7af 835 def _real_extract(self, url):
7e8c0af0 836 proto = (
78caa52a
PH
837 'http' if self._downloader.params.get('prefer_insecure', False)
838 else 'https')
7e8c0af0 839
c5e8d7af
PH
840 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
841 mobj = re.search(self._NEXT_URL_RE, url)
842 if mobj:
7e8c0af0 843 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 844 video_id = self.extract_id(url)
c5e8d7af
PH
845
846 # Get video webpage
aa79ac0c 847 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 848 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
849
850 # Attempt to extract SWF player URL
e0df6211 851 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
852 if mobj is not None:
853 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
854 else:
855 player_url = None
856
857 # Get video info
6449cd80 858 embed_webpage = None
c108eb73 859 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
860 age_gate = True
861 # We simulate the access to the video from www.youtube.com/v/{video_id}
862 # this can be viewed without login into Youtube
beb95e77
CL
863 url = proto + '://www.youtube.com/embed/%s' % video_id
864 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
2c57c7fa
JMF
865 data = compat_urllib_parse.urlencode({
866 'video_id': video_id,
867 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 868 'sts': self._search_regex(
beb95e77 869 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
2c57c7fa 870 })
7e8c0af0 871 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
872 video_info_webpage = self._download_webpage(
873 video_info_url, video_id,
20436c30 874 note='Refetching age-gated info webpage',
94bd3613 875 errnote='unable to download video info webpage')
c5e8d7af 876 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
877 else:
878 age_gate = False
4e62ebe2
JMF
879 try:
880 # Try looking directly into the video webpage
881 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
882 if not mobj:
883 raise ValueError('Could not find ytplayer.config') # caught below
884 json_code = uppercase_escape(mobj.group(1))
885 ytplayer_config = json.loads(json_code)
886 args = ytplayer_config['args']
887 # Convert to the same format returned by compat_parse_qs
888 video_info = dict((k, [v]) for k, v in args.items())
e40bd5f0 889 if not args.get('url_encoded_fmt_stream_map'):
4e62ebe2
JMF
890 raise ValueError('No stream_map present') # caught below
891 except ValueError:
892 # We fallback to the get_video_info pages (used by the embed page)
893 self.report_video_info_webpage_download(video_id)
894 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
810fb84d
PH
895 video_info_url = (
896 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
897 % (proto, video_id, el_type))
898 video_info_webpage = self._download_webpage(
899 video_info_url,
4e62ebe2
JMF
900 video_id, note=False,
901 errnote='unable to download video info webpage')
902 video_info = compat_parse_qs(video_info_webpage)
903 if 'token' in video_info:
904 break
c5e8d7af
PH
905 if 'token' not in video_info:
906 if 'reason' in video_info:
af214c3a
YCH
907 if 'The uploader has not made this video available in your country.' in video_info['reason']:
908 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
909 if regions_allowed is not None:
910 raise ExtractorError('YouTube said: This video is available in %s only' % (
911 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
912 expected=True)
d11271dd 913 raise ExtractorError(
78caa52a 914 'YouTube said: %s' % video_info['reason'][0],
d11271dd 915 expected=True, video_id=video_id)
c5e8d7af 916 else:
d11271dd 917 raise ExtractorError(
78caa52a 918 '"token" parameter not in video info for unknown reason',
d11271dd 919 video_id=video_id)
c5e8d7af 920
1d699755
PH
921 if 'view_count' in video_info:
922 view_count = int(video_info['view_count'][0])
923 else:
924 view_count = None
925
c5e8d7af
PH
926 # Check for "rental" videos
927 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 928 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
929
930 # Start extracting information
931 self.report_information_extraction(video_id)
932
933 # uploader
934 if 'author' not in video_info:
69ea8ca4 935 raise ExtractorError('Unable to extract uploader name')
c5e8d7af
PH
936 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
937
938 # uploader_id
939 video_uploader_id = None
940 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
941 if mobj is not None:
942 video_uploader_id = mobj.group(1)
943 else:
69ea8ca4 944 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af
PH
945
946 # title
a8c6b241 947 if 'title' in video_info:
aa92f063 948 video_title = video_info['title'][0]
a8c6b241 949 else:
69ea8ca4 950 self._downloader.report_warning('Unable to extract video title')
78caa52a 951 video_title = '_'
c5e8d7af
PH
952
953 # thumbnail image
7763b04e
JMF
954 # We try first to get a high quality image:
955 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
956 video_webpage, re.DOTALL)
957 if m_thumb is not None:
958 video_thumbnail = m_thumb.group(1)
959 elif 'thumbnail_url' not in video_info:
69ea8ca4 960 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 961 video_thumbnail = None
c5e8d7af
PH
962 else: # don't panic if we can't find it
963 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
964
965 # upload date
966 upload_date = None
ad3bc6ac 967 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
968 if mobj is None:
969 mobj = re.search(
263bd4ec 970 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 971 video_webpage)
c5e8d7af
PH
972 if mobj is not None:
973 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
974 upload_date = unified_strdate(upload_date)
975
55f7bd2d
PH
976 m_cat_container = self._search_regex(
977 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
624dcebf 978 video_webpage, 'categories', default=None)
ec8deefc 979 if m_cat_container:
ad3bc6ac 980 category = self._html_search_regex(
01ed5c9b 981 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
982 default=None)
983 video_categories = None if category is None else [category]
984 else:
985 video_categories = None
ec8deefc 986
c5e8d7af
PH
987 # description
988 video_description = get_element_by_id("eow-description", video_webpage)
989 if video_description:
27dcce19
PH
990 video_description = re.sub(r'''(?x)
991 <a\s+
992 (?:[a-zA-Z-]+="[^"]+"\s+)*?
993 title="([^"]+)"\s+
994 (?:[a-zA-Z-]+="[^"]+"\s+)*?
995 class="yt-uix-redirect-link"\s*>
996 [^<]+
997 </a>
998 ''', r'\1', video_description)
c5e8d7af
PH
999 video_description = clean_html(video_description)
1000 else:
1001 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1002 if fd_mobj:
1003 video_description = unescapeHTML(fd_mobj.group(1))
1004 else:
78caa52a 1005 video_description = ''
c5e8d7af 1006
f30a38be 1007 def _extract_count(count_name):
46374a56 1008 count = self._search_regex(
f30a38be
JMF
1009 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
1010 video_webpage, count_name, default=None)
336c3a69
JMF
1011 if count is not None:
1012 return int(count.replace(',', ''))
1013 return None
69ea8ca4
PH
1014 like_count = _extract_count('like')
1015 dislike_count = _extract_count('dislike')
336c3a69 1016
c5e8d7af 1017 # subtitles
d82134c3 1018 video_subtitles = self.extract_subtitles(video_id, video_webpage)
360e1ca5 1019 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
c5e8d7af
PH
1020
1021 if 'length_seconds' not in video_info:
69ea8ca4 1022 self._downloader.report_warning('unable to extract video duration')
b466b702 1023 video_duration = None
c5e8d7af 1024 else:
b466b702 1025 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1026
1fb07d10
JG
1027 # annotations
1028 video_annotations = None
1029 if self._downloader.params.get('writeannotations', False):
5f6a1245 1030 video_annotations = self._extract_annotations(video_id)
1fb07d10 1031
dd27fd17
PH
1032 def _map_to_format_list(urlmap):
1033 formats = []
1034 for itag, video_real_url in urlmap.items():
1035 dct = {
1036 'format_id': itag,
1037 'url': video_real_url,
1038 'player_url': player_url,
1039 }
0b65e5d4
PH
1040 if itag in self._formats:
1041 dct.update(self._formats[itag])
dd27fd17
PH
1042 formats.append(dct)
1043 return formats
1044
c5e8d7af
PH
1045 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1046 self.report_rtmp_download()
dd27fd17
PH
1047 formats = [{
1048 'format_id': '_rtmp',
1049 'protocol': 'rtmp',
1050 'url': video_info['conn'][0],
1051 'player_url': player_url,
1052 }]
24270b03 1053 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
5f6a1245 1054 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1055 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1056 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1057 url_map = {}
00fe14fc 1058 for url_data_str in encoded_url_map.split(','):
c5e8d7af 1059 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
1060 if 'itag' not in url_data or 'url' not in url_data:
1061 continue
1062 format_id = url_data['itag'][0]
1063 url = url_data['url'][0]
1064
1065 if 'sig' in url_data:
1066 url += '&signature=' + url_data['sig'][0]
1067 elif 's' in url_data:
1068 encrypted_sig = url_data['s'][0]
6449cd80 1069 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
201e9eaa 1070
beb95e77 1071 jsplayer_url_json = self._search_regex(
6449cd80
PH
1072 ASSETS_RE,
1073 embed_webpage if age_gate else video_webpage,
1074 'JS player URL (1)', default=None)
1075 if not jsplayer_url_json and not age_gate:
1076 # We need the embed website after all
1077 if embed_webpage is None:
1078 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1079 embed_webpage = self._download_webpage(
1080 embed_url, video_id, 'Downloading embed webpage')
1081 jsplayer_url_json = self._search_regex(
1082 ASSETS_RE, embed_webpage, 'JS player URL')
1083
beb95e77 1084 player_url = json.loads(jsplayer_url_json)
201e9eaa
PH
1085 if player_url is None:
1086 player_url_json = self._search_regex(
1087 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 1088 video_webpage, 'age gate player URL')
201e9eaa
PH
1089 player_url = json.loads(player_url_json)
1090
1091 if self._downloader.params.get('verbose'):
cf010131 1092 if player_url is None:
201e9eaa
PH
1093 player_version = 'unknown'
1094 player_desc = 'unknown'
1095 else:
1096 if player_url.endswith('swf'):
1097 player_version = self._search_regex(
1098 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 1099 'flash player', fatal=False)
201e9eaa 1100 player_desc = 'flash player %s' % player_version
cf010131 1101 else:
201e9eaa
PH
1102 player_version = self._search_regex(
1103 r'html5player-([^/]+?)(?:/html5player)?\.js',
1104 player_url,
1105 'html5 player', fatal=False)
78caa52a 1106 player_desc = 'html5 player %s' % player_version
201e9eaa 1107
60064c53 1108 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 1109 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 1110 (format_id, parts_sizes, player_desc))
201e9eaa
PH
1111
1112 signature = self._decrypt_signature(
1113 encrypted_sig, video_id, player_url, age_gate)
1114 url += '&signature=' + signature
1115 if 'ratebypass' not in url:
1116 url += '&ratebypass=yes'
1117 url_map[format_id] = url
dd27fd17 1118 formats = _map_to_format_list(url_map)
1d043b93
JMF
1119 elif video_info.get('hlsvp'):
1120 manifest_url = video_info['hlsvp'][0]
1121 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 1122 formats = _map_to_format_list(url_map)
c5e8d7af 1123 else:
69ea8ca4 1124 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1125
dd27fd17 1126 # Look for the DASH manifest
203fb43f 1127 if self._downloader.params.get('youtube_include_dash_manifest', True):
774e208f 1128 dash_mpd = video_info.get('dashmpd')
75111274 1129 if dash_mpd:
774e208f
PH
1130 dash_manifest_url = dash_mpd[0]
1131 try:
1132 dash_formats = self._parse_dash_manifest(
da276600 1133 video_id, dash_manifest_url, player_url, age_gate)
774e208f
PH
1134 except (ExtractorError, KeyError) as e:
1135 self.report_warning(
1136 'Skipping DASH manifest: %r' % e, video_id)
1137 else:
04b3b3df
JMF
1138 # Remove the formats we found through non-DASH, they
1139 # contain less info and it can be wrong, because we use
1140 # fixed values (for example the resolution). See
1141 # https://github.com/rg3/youtube-dl/issues/5774 for an
1142 # example.
e65566a9 1143 dash_keys = set(df['format_id'] for df in dash_formats)
04b3b3df 1144 formats = [f for f in formats if f['format_id'] not in dash_keys]
774e208f 1145 formats.extend(dash_formats)
d80044c2 1146
6271f1ca
PH
1147 # Check for malformed aspect ratio
1148 stretched_m = re.search(
1149 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1150 video_webpage)
1151 if stretched_m:
1152 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1153 for f in formats:
1154 if f.get('vcodec') != 'none':
1155 f['stretched_ratio'] = ratio
1156
4bcc7bd1 1157 self._sort_formats(formats)
4ea3be0a 1158
1159 return {
8bcc8756
JW
1160 'id': video_id,
1161 'uploader': video_uploader,
1162 'uploader_id': video_uploader_id,
1163 'upload_date': upload_date,
1164 'title': video_title,
1165 'thumbnail': video_thumbnail,
1166 'description': video_description,
1167 'categories': video_categories,
1168 'subtitles': video_subtitles,
360e1ca5 1169 'automatic_captions': automatic_captions,
8bcc8756
JW
1170 'duration': video_duration,
1171 'age_limit': 18 if age_gate else 0,
1172 'annotations': video_annotations,
7e8c0af0 1173 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 1174 'view_count': view_count,
4ea3be0a 1175 'like_count': like_count,
1176 'dislike_count': dislike_count,
2d30521a 1177 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
8bcc8756 1178 'formats': formats,
4ea3be0a 1179 }
c5e8d7af 1180
5f6a1245 1181
880e1c52 1182class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1183 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1184 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1185 (?:https?://)?
1186 (?:\w+\.)?
1187 youtube\.com/
1188 (?:
ac7553d0 1189 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1190 \? (?:.*?&)*? (?:p|a|list)=
1191 | p/
1192 )
d67cc9fa 1193 (
99209c29 1194 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1195 # Top tracks, they can also include dots
d67cc9fa
JMF
1196 |(?:MC)[\w\.]*
1197 )
c5e8d7af
PH
1198 .*
1199 |
99209c29 1200 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1201 )"""
dbb94fb0 1202 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1203 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1204 IE_NAME = 'youtube:playlist'
81127aa5
PH
1205 _TESTS = [{
1206 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1207 'info_dict': {
1208 'title': 'ytdl test PL',
a1cf99d0 1209 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1210 },
1211 'playlist_count': 3,
9291475f
PH
1212 }, {
1213 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1214 'info_dict': {
acf757f4 1215 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
1216 'title': 'YDL_Empty_List',
1217 },
1218 'playlist_count': 0,
1219 }, {
1220 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1221 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1222 'info_dict': {
1223 'title': '29C3: Not my department',
acf757f4 1224 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
1225 },
1226 'playlist_count': 95,
1227 }, {
1228 'note': 'issue #673',
1229 'url': 'PLBB231211A4F62143',
1230 'info_dict': {
f46a8702 1231 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 1232 'id': 'PLBB231211A4F62143',
9291475f
PH
1233 },
1234 'playlist_mincount': 26,
1235 }, {
1236 'note': 'Large playlist',
1237 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1238 'info_dict': {
1239 'title': 'Uploads from Cauchemar',
acf757f4 1240 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
1241 },
1242 'playlist_mincount': 799,
1243 }, {
1244 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1245 'info_dict': {
1246 'title': 'YDL_safe_search',
acf757f4 1247 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
1248 },
1249 'playlist_count': 2,
ac7553d0
PH
1250 }, {
1251 'note': 'embedded',
1252 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1253 'playlist_count': 4,
1254 'info_dict': {
1255 'title': 'JODA15',
acf757f4 1256 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 1257 }
6b08cdf6
PH
1258 }, {
1259 'note': 'Embedded SWF player',
1260 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1261 'playlist_count': 4,
1262 'info_dict': {
1263 'title': 'JODA7',
acf757f4 1264 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 1265 }
4b7df0d3
JMF
1266 }, {
1267 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1268 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1269 'info_dict': {
acf757f4
PH
1270 'title': 'Uploads from Interstellar Movie',
1271 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3
JMF
1272 },
1273 'playlist_mincout': 21,
81127aa5 1274 }]
c5e8d7af 1275
880e1c52
JMF
1276 def _real_initialize(self):
1277 self._login()
1278
652cdaa2 1279 def _extract_mix(self, playlist_id):
99209c29 1280 # The mixes are generated from a single video
652cdaa2 1281 # the id of the playlist is just 'RD' + video_id
7d4afc55 1282 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
c9cc0bf5 1283 webpage = self._download_webpage(
78caa52a 1284 url, playlist_id, 'Downloading Youtube mix')
bc2f773b 1285 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
c9cc0bf5
PH
1286 title_span = (
1287 search_title('playlist-title') or
1288 search_title('title long-title') or
1289 search_title('title'))
76d1700b 1290 title = clean_html(title_span)
c9cc0bf5
PH
1291 ids = orderedSet(re.findall(
1292 r'''(?xs)data-video-username=".*?".*?
1293 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1294 webpage))
652cdaa2
JMF
1295 url_results = self._ids_to_results(ids)
1296
1297 return self.playlist_result(url_results, playlist_id, title)
1298
448830ce 1299 def _extract_playlist(self, playlist_id):
dbb94fb0
S
1300 url = self._TEMPLATE_URL % playlist_id
1301 page = self._download_webpage(url, playlist_id)
dbb94fb0 1302
39b62db1
YCH
1303 for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
1304 match = match.strip()
1305 # Check if the playlist exists or is private
1306 if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
1307 raise ExtractorError(
1308 'The playlist doesn\'t exist or is private, use --username or '
1309 '--netrc to access it.',
1310 expected=True)
1311 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
1312 raise ExtractorError(
1313 'Invalid parameters. Maybe URL is incorrect.',
1314 expected=True)
1315 elif re.match(r'[^<]*Choose your language[^<]*', match):
1316 continue
1317 else:
1318 self.report_warning('Youtube gives an alert message: ' + match)
10c0e2d8 1319
dcbb4580 1320 # Extract the video ids from the playlist pages
70219b0f
JMF
1321 def _entries():
1322 more_widget_html = content_html = page
1323 for page_num in itertools.count(1):
1324 matches = re.finditer(self._VIDEO_RE, content_html)
1325 # We remove the duplicates and the link with index 0
1326 # (it's not the first video of the playlist)
1327 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1328 for vid_id in new_ids:
1329 yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
1330
1331 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1332 if not mobj:
1333 break
1334
1335 more = self._download_json(
1336 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1337 'Downloading page #%s' % page_num,
1338 transform_source=uppercase_escape)
1339 content_html = more['content_html']
1340 if not content_html.strip():
1341 # Some webpages show a "Load more" button but they don't
1342 # have more videos
1343 break
1344 more_widget_html = more['load_more_widget_html']
dbb94fb0
S
1345
1346 playlist_title = self._html_search_regex(
68eb8e90 1347 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
78caa52a 1348 page, 'title')
c5e8d7af 1349
70219b0f 1350 return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1351
448830ce
S
1352 def _real_extract(self, url):
1353 # Extract playlist id
1354 mobj = re.match(self._VALID_URL, url)
1355 if mobj is None:
1356 raise ExtractorError('Invalid URL: %s' % url)
1357 playlist_id = mobj.group(1) or mobj.group(2)
1358
1359 # Check if it's a video-specific URL
1360 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1361 if 'v' in query_dict:
1362 video_id = query_dict['v'][0]
1363 if self._downloader.params.get('noplaylist'):
1364 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1365 return self.url_result(video_id, 'Youtube', video_id=video_id)
1366 else:
1367 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1368
1369 if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
1370 # Mixes require a custom extraction process
1371 return self._extract_mix(playlist_id)
1372
1373 return self._extract_playlist(playlist_id)
1374
c5e8d7af
PH
1375
1376class YoutubeChannelIE(InfoExtractor):
78caa52a 1377 IE_DESC = 'YouTube.com channels'
9ff67727 1378 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
eb0f3e7e 1379 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
78caa52a 1380 IE_NAME = 'youtube:channel'
cdc628a4
PH
1381 _TESTS = [{
1382 'note': 'paginated channel',
1383 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
1384 'playlist_mincount': 91,
acf757f4
PH
1385 'info_dict': {
1386 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
1387 }
cdc628a4 1388 }]
c5e8d7af 1389
6de5dbaf
S
1390 @staticmethod
1391 def extract_videos_from_page(page):
c5e8d7af 1392 ids_in_page = []
fb69240c
S
1393 titles_in_page = []
1394 for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
1395 video_id = mobj.group('id')
1396 video_title = unescapeHTML(mobj.group('title'))
1397 try:
1398 idx = ids_in_page.index(video_id)
1399 if video_title and not titles_in_page[idx]:
1400 titles_in_page[idx] = video_title
1401 except ValueError:
1402 ids_in_page.append(video_id)
1403 titles_in_page.append(video_title)
1404 return zip(ids_in_page, titles_in_page)
c5e8d7af
PH
1405
1406 def _real_extract(self, url):
9ff67727 1407 channel_id = self._match_id(url)
c5e8d7af 1408
eb0f3e7e 1409 url = self._TEMPLATE_URL % channel_id
386bdfa6
S
1410
1411 # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
1412 # Workaround by extracting as a playlist if managed to obtain channel playlist URL
1413 # otherwise fallback on channel by page extraction
1414 channel_page = self._download_webpage(
1415 url + '?view=57', channel_id,
1416 'Downloading channel page', fatal=False)
3d8e9573
S
1417 channel_playlist_id = self._html_search_meta(
1418 'channelId', channel_page, 'channel id', default=None)
1419 if not channel_playlist_id:
1420 channel_playlist_id = self._search_regex(
1421 r'data-channel-external-id="([^"]+)"',
1422 channel_page, 'channel id', default=None)
386bdfa6
S
1423 if channel_playlist_id and channel_playlist_id.startswith('UC'):
1424 playlist_id = 'UU' + channel_playlist_id[2:]
d2a9de78
IK
1425 return self.url_result(
1426 compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
386bdfa6 1427
60bf45c8 1428 channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
31812a9e
PH
1429 autogenerated = re.search(r'''(?x)
1430 class="[^"]*?(?:
1431 channel-header-autogenerated-label|
1432 yt-channel-title-autogenerated
1433 )[^"]*"''', channel_page) is not None
c5e8d7af 1434
b9643eed
JMF
1435 if autogenerated:
1436 # The videos are contained in a single page
1437 # the ajax pages can't be used, they are empty
b82f815f 1438 entries = [
fb69240c
S
1439 self.url_result(
1440 video_id, 'Youtube', video_id=video_id,
1441 video_title=video_title)
8f02ad4f 1442 for video_id, video_title in self.extract_videos_from_page(channel_page)]
b82f815f
PH
1443 return self.playlist_result(entries, channel_id)
1444
1445 def _entries():
23d3608c 1446 more_widget_html = content_html = channel_page
b9643eed 1447 for pagenum in itertools.count(1):
81c2f20b 1448
8f02ad4f 1449 for video_id, video_title in self.extract_videos_from_page(content_html):
b82f815f 1450 yield self.url_result(
fb69240c
S
1451 video_id, 'Youtube', video_id=video_id,
1452 video_title=video_title)
5f6a1245 1453
23d3608c
JMF
1454 mobj = re.search(
1455 r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
1456 more_widget_html)
1457 if not mobj:
b9643eed 1458 break
c5e8d7af 1459
23d3608c
JMF
1460 more = self._download_json(
1461 'https://youtube.com/%s' % mobj.group('more'), channel_id,
1462 'Downloading page #%s' % (pagenum + 1),
1463 transform_source=uppercase_escape)
1464 content_html = more['content_html']
1465 more_widget_html = more['load_more_widget_html']
1466
b82f815f 1467 return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1468
1469
eb0f3e7e 1470class YoutubeUserIE(YoutubeChannelIE):
78caa52a 1471 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
9ff67727 1472 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
eb0f3e7e 1473 _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
78caa52a 1474 IE_NAME = 'youtube:user'
c5e8d7af 1475
cdc628a4
PH
1476 _TESTS = [{
1477 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
1478 'playlist_mincount': 320,
1479 'info_dict': {
1480 'title': 'TheLinuxFoundation',
1481 }
1482 }, {
1483 'url': 'ytuser:phihag',
1484 'only_matching': True,
1485 }]
1486
e3ea4790 1487 @classmethod
f4b05232 1488 def suitable(cls, url):
e3ea4790
JMF
1489 # Don't return True if the url can be extracted with other youtube
1490 # extractor, the regex would is too permissive and it would match.
1491 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
5f6a1245
JW
1492 if any(ie.suitable(url) for ie in other_ies):
1493 return False
1494 else:
1495 return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1496
b05654f0 1497
b4c08069 1498class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
78caa52a 1499 IE_DESC = 'YouTube.com searches'
b4c08069
JMF
1500 # there doesn't appear to be a real limit, for example if you search for
1501 # 'python' you get more than 8.000.000 results
1502 _MAX_RESULTS = float('inf')
78caa52a 1503 IE_NAME = 'youtube:search'
b05654f0 1504 _SEARCH_KEY = 'ytsearch'
b4c08069 1505 _EXTRA_QUERY_ARGS = {}
9dd8e46a 1506 _TESTS = []
b05654f0 1507
b05654f0
PH
1508 def _get_n_results(self, query, n):
1509 """Get a specified number of results for a query"""
1510
b4c08069 1511 videos = []
b05654f0
PH
1512 limit = n
1513
b4c08069
JMF
1514 for pagenum in itertools.count(1):
1515 url_query = {
02175a79 1516 'search_query': query.encode('utf-8'),
b4c08069
JMF
1517 'page': pagenum,
1518 'spf': 'navigate',
1519 }
1520 url_query.update(self._EXTRA_QUERY_ARGS)
1521 result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
1522 data = self._download_json(
69ea8ca4 1523 result_url, video_id='query "%s"' % query,
b4c08069 1524 note='Downloading page %s' % pagenum,
69ea8ca4 1525 errnote='Unable to download API page')
b4c08069 1526 html_content = data[1]['body']['content']
7cc3570e 1527
b4c08069 1528 if 'class="search-message' in html_content:
07ad22b8 1529 raise ExtractorError(
78caa52a 1530 '[youtube] No video results', expected=True)
b05654f0 1531
b4c08069
JMF
1532 new_videos = self._ids_to_results(orderedSet(re.findall(
1533 r'href="/watch\?v=(.{11})', html_content)))
1534 videos += new_videos
1535 if not new_videos or len(videos) > limit:
1536 break
b05654f0 1537
b4c08069
JMF
1538 if len(videos) > n:
1539 videos = videos[:n]
b05654f0 1540 return self.playlist_result(videos, query)
75dff0ee 1541
c9ae7b95 1542
a3dd9248 1543class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1544 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248 1545 _SEARCH_KEY = 'ytsearchdate'
78caa52a 1546 IE_DESC = 'YouTube.com searches, newest videos first'
b4c08069 1547 _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1548
c9ae7b95
PH
1549
1550class YoutubeSearchURLIE(InfoExtractor):
78caa52a
PH
1551 IE_DESC = 'YouTube.com search URLs'
1552 IE_NAME = 'youtube:search_url'
c9ae7b95 1553 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
cdc628a4
PH
1554 _TESTS = [{
1555 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
1556 'playlist_mincount': 5,
1557 'info_dict': {
1558 'title': 'youtube-dl test video',
1559 }
1560 }]
c9ae7b95
PH
1561
1562 def _real_extract(self, url):
1563 mobj = re.match(self._VALID_URL, url)
1564 query = compat_urllib_parse.unquote_plus(mobj.group('query'))
1565
1566 webpage = self._download_webpage(url, query)
1567 result_code = self._search_regex(
98998cde 1568 r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
c9ae7b95
PH
1569
1570 part_codes = re.findall(
1571 r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
1572 entries = []
1573 for part_code in part_codes:
1574 part_title = self._html_search_regex(
6feb2d5e 1575 [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
c9ae7b95
PH
1576 part_url_snippet = self._html_search_regex(
1577 r'(?s)href="([^"]+)"', part_code, 'item URL')
1578 part_url = compat_urlparse.urljoin(
1579 'https://www.youtube.com/', part_url_snippet)
1580 entries.append({
1581 '_type': 'url',
1582 'url': part_url,
1583 'title': part_title,
1584 })
1585
1586 return {
1587 '_type': 'playlist',
1588 'entries': entries,
1589 'title': query,
1590 }
1591
1592
75dff0ee 1593class YoutubeShowIE(InfoExtractor):
78caa52a 1594 IE_DESC = 'YouTube.com (multi-season) shows'
cdc628a4 1595 _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
78caa52a 1596 IE_NAME = 'youtube:show'
cdc628a4
PH
1597 _TESTS = [{
1598 'url': 'http://www.youtube.com/show/airdisasters',
1599 'playlist_mincount': 3,
1600 'info_dict': {
1601 'id': 'airdisasters',
1602 'title': 'Air Disasters',
1603 }
1604 }]
75dff0ee
JMF
1605
1606 def _real_extract(self, url):
1607 mobj = re.match(self._VALID_URL, url)
cdc628a4
PH
1608 playlist_id = mobj.group('id')
1609 webpage = self._download_webpage(
1610 url, playlist_id, 'Downloading show webpage')
75dff0ee
JMF
1611 # There's one playlist for each season of the show
1612 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
cdc628a4
PH
1613 self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
1614 entries = [
1615 self.url_result(
1616 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
1617 for season in m_seasons
1618 ]
1619 title = self._og_search_title(webpage, fatal=False)
1620
1621 return {
1622 '_type': 'playlist',
1623 'id': playlist_id,
1624 'title': title,
1625 'entries': entries,
1626 }
04cc9617
JMF
1627
1628
b2e8bc1b 1629class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639 1630 """
25f14e9f 1631 Base class for feed extractors
d7ae0639
JMF
1632 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1633 """
b2e8bc1b 1634 _LOGIN_REQUIRED = True
d7ae0639
JMF
1635
1636 @property
1637 def IE_NAME(self):
78caa52a 1638 return 'youtube:%s' % self._FEED_NAME
04cc9617 1639
81f0259b 1640 def _real_initialize(self):
b2e8bc1b 1641 self._login()
81f0259b 1642
04cc9617 1643 def _real_extract(self, url):
25f14e9f
S
1644 page = self._download_webpage(
1645 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
2bc43303
JMF
1646
1647 # The extraction process is the same as for playlists, but the regex
1648 # for the video ids doesn't contain an index
1649 ids = []
1650 more_widget_html = content_html = page
2bc43303
JMF
1651 for page_num in itertools.count(1):
1652 matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
62c95fd5
S
1653
1654 # 'recommended' feed has infinite 'load more' and each new portion spins
1655 # the same videos in (sometimes) slightly different order, so we'll check
1656 # for unicity and break when portion has no new videos
1657 new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
1658 if not new_ids:
1659 break
1660
2bc43303
JMF
1661 ids.extend(new_ids)
1662
1663 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1664 if not mobj:
1665 break
1666
1667 more = self._download_json(
25f14e9f 1668 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
2bc43303
JMF
1669 'Downloading page #%s' % page_num,
1670 transform_source=uppercase_escape)
1671 content_html = more['content_html']
1672 more_widget_html = more['load_more_widget_html']
1673
25f14e9f
S
1674 return self.playlist_result(
1675 self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1676
1677
1678class YoutubeWatchLaterIE(YoutubePlaylistIE):
1679 IE_NAME = 'youtube:watchlater'
1680 IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1681 _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
1682
1683 _TESTS = [] # override PlaylistIE tests
1684
1685 def _real_extract(self, url):
1686 return self._extract_playlist('WL')
f459d170 1687
5f6a1245 1688
c626a3d9 1689class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
78caa52a 1690 IE_NAME = 'youtube:favorites'
f3a34072 1691 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
c7a7750d 1692 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1693 _LOGIN_REQUIRED = True
1694
1695 def _real_extract(self, url):
1696 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
78caa52a 1697 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
c626a3d9 1698 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1699
1700
25f14e9f
S
1701class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1702 IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1703 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1704 _FEED_NAME = 'recommended'
1705 _PLAYLIST_TITLE = 'Youtube Recommended videos'
1ed5b5c9 1706
1ed5b5c9 1707
25f14e9f
S
1708class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1709 IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1710 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1711 _FEED_NAME = 'subscriptions'
1712 _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1713
1ed5b5c9 1714
25f14e9f
S
1715class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1716 IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
1717 _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
1718 _FEED_NAME = 'history'
1719 _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1720
1721
15870e90
PH
1722class YoutubeTruncatedURLIE(InfoExtractor):
1723 IE_NAME = 'youtube:truncated_url'
1724 IE_DESC = False # Do not list
975d35db 1725 _VALID_URL = r'''(?x)
b95aab84
PH
1726 (?:https?://)?
1727 (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
1728 (?:watch\?(?:
c4808c60 1729 feature=[a-z_]+|
b95aab84
PH
1730 annotation_id=annotation_[^&]+|
1731 x-yt-cl=[0-9]+|
c1708b89 1732 hl=[^&]*|
b95aab84
PH
1733 )?
1734 |
1735 attribution_link\?a=[^&]+
1736 )
1737 $
975d35db 1738 '''
15870e90 1739
c4808c60
PH
1740 _TESTS = [{
1741 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
1742 'only_matching': True,
dc2fc736
PH
1743 }, {
1744 'url': 'http://www.youtube.com/watch?',
1745 'only_matching': True,
b95aab84
PH
1746 }, {
1747 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
1748 'only_matching': True,
1749 }, {
1750 'url': 'https://www.youtube.com/watch?feature=foo',
1751 'only_matching': True,
c1708b89
PH
1752 }, {
1753 'url': 'https://www.youtube.com/watch?hl=en-GB',
1754 'only_matching': True,
c4808c60
PH
1755 }]
1756
15870e90
PH
1757 def _real_extract(self, url):
1758 raise ExtractorError(
78caa52a
PH
1759 'Did you forget to quote the URL? Remember that & is a meta '
1760 'character in most shells, so you want to put the URL in quotes, '
1761 'like youtube-dl '
1762 '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1763 ' or simply youtube-dl BaW_jenozKc .',
15870e90 1764 expected=True)
772fd5cc
PH
1765
1766
1767class YoutubeTruncatedIDIE(InfoExtractor):
1768 IE_NAME = 'youtube:truncated_id'
1769 IE_DESC = False # Do not list
b95aab84 1770 _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
772fd5cc
PH
1771
1772 _TESTS = [{
1773 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
1774 'only_matching': True,
1775 }]
1776
1777 def _real_extract(self, url):
1778 video_id = self._match_id(url)
1779 raise ExtractorError(
1780 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
1781 expected=True)