]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Improve multifeed videos extraction (Closes #8536)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
7c61bd36 23 compat_urlparse,
c5e8d7af 24 compat_str,
4bb4a188
PH
25)
26from ..utils import (
c5e8d7af 27 clean_html,
515fc877 28 encode_dict,
9b9c5355 29 error_to_compat_str,
c5e8d7af 30 ExtractorError,
2d30521a 31 float_or_none,
4bb4a188
PH
32 get_element_by_attribute,
33 get_element_by_id,
dd27fd17 34 int_or_none,
94278f72 35 mimetype2ext,
4bb4a188 36 orderedSet,
7c80519c 37 parse_duration,
0cb58b02 38 remove_quotes,
041bc3ad 39 remove_start,
5c2266df 40 sanitized_Request,
cf7e015f 41 smuggle_url,
c93d53f5 42 str_to_int,
c5e8d7af
PH
43 unescapeHTML,
44 unified_strdate,
cf7e015f 45 unsmuggle_url,
81c2f20b 46 uppercase_escape,
af214c3a 47 ISO3166Utils,
c5e8d7af
PH
48)
49
5f6a1245 50
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie (containing ``hl=en``) on ``.youtube.com``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one ``url_result`` (handled by the 'Youtube' extractor) per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Failing to fetch the login page is a failed login: return False
            # explicitly (as documented) instead of an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

        req = sanitized_Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                # The trailing space keeps the two adjacent literals from
                # running together as "<code>(Note ...".
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Drop a leading 'G-' prefix from the code, if present
            tfa_code = remove_start(tfa_code, 'G-')

            # Start from the challenge form's hidden inputs and add the code
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the challenge form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again is treated as rejected credentials
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless no downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 182
8377574c 183
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of ``page`` and of every "Load more" continuation.

        Each chunk of HTML is handed to ``self._process_page``; the next chunk
        is located through the ``data-uix-load-more-href`` attribute of the
        most recent load-more widget and fetched as JSON.
        """
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for item in self._process_page(current_html):
                yield item

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                # No further continuation link: the listing is exhausted
                return

            next_url = 'https://youtube.com/%s' % load_more.group('more')
            progress_note = 'Downloading page #%s' % page_num
            more = self._download_json(
                next_url, playlist_id, progress_note,
                transform_source=uppercase_escape)

            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
206
061a75ed
S
207
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' ``url_result`` for every video found in ``content``."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page(self, page):
        """Return (video id, title) pairs matched by ``self._VIDEO_RE`` in ``page``.

        Ids are kept in order of first appearance. When an id occurs more than
        once, the first non-empty title seen for it is kept.
        """
        video_ids = []
        video_titles = []
        for match in re.finditer(self._VIDEO_RE, page):
            vid = match.group('id')
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the *id* group with '0', not the
            # index group - confirm whether that is intended.
            if 'index' in match.groupdict() and vid == '0':
                continue

            title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()

            if vid in video_ids:
                # Already known: only fill in a title that is still missing
                pos = video_ids.index(vid)
                if title and not video_titles[pos]:
                    video_titles[pos] = title
            else:
                video_ids.append(vid)
                video_titles.append(title)
        return zip(video_ids, video_titles)
232
233
061a75ed
S
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' ``url_result`` per distinct playlist link in ``content``."""
        found_ids = re.findall(
            r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result built from its entries."""
        list_id = self._match_id(url)
        page = self._download_webpage(url, list_id)
        # The title is optional: a missing og:title must not abort extraction
        list_title = self._og_search_title(page, fatal=False)
        entries = self._entries(page, list_id)
        return self.playlist_result(entries, list_id, list_title)
0c148415
S
245
246
360e1ca5 247class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 248 IE_DESC = 'YouTube.com'
cb7dfeea 249 _VALID_URL = r"""(?x)^
c5e8d7af 250 (
edb53e2d 251 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 252 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 253 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 254 (?:www\.)?pwnyoutube\.com/|
f7000f3a 255 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
256 tube\.majestyc\.net/|
257 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
258 (?:.*?\#/)? # handle anchor (#/) redirect urls
259 (?: # the various things that can precede the ID:
ac7553d0 260 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 261 |(?: # or the v= param in all its forms
f7000f3a 262 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 263 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 264 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
265 v=
266 )
f4b05232 267 ))
cbaed4bb
S
268 |(?:
269 youtu\.be| # just youtu.be/xxxx
270 vid\.plus # or vid.plus/xxxx
271 )/
edb53e2d 272 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 273 )
c5e8d7af 274 )? # all until now is optional -> you can pass the naked ID
8963d9c2 275 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 276 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
277 (?(1).+)? # if we found the ID, everything can follow
278 $"""
c5e8d7af 279 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26 280 _formats = {
e1a0bfdf 281 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
282 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
283 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
284 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
285 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
286 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
287 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
288 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3
S
289 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
290 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
e1a0bfdf 291 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
292 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
293 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
294 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
295 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
296 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
297 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
298 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
299
300
301 # 3D videos
302 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
303 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
304 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
305 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
306 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
307 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
308 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 309
96fb5605 310 # Apple HTTP Live Streaming
e1a0bfdf 311 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
312 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
313 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
314 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
315 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
316 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
317 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
318
319 # DASH mp4 video
a6c2c244
YCH
320 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
321 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
322 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
323 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
324 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
325 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
326 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
327 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
328 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
329 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
330 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
836a086c 331
f6f1fc92 332 # Dash mp4 audio
a6c2c244
YCH
333 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
334 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
335 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
336
337 # Dash webm
a6c2c244
YCH
338 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
339 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
340 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
341 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
342 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
343 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
344 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
345 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
346 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
347 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
348 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
349 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
350 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
351 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
352 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
4c6b4764 353 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
a6c2c244
YCH
354 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
355 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
356 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
357 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
358 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
359 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
2c62dc26
PH
360
361 # Dash webm audio
a6c2c244
YCH
362 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
363 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 364
0857baad 365 # Dash webm audio with opus inside
a6c2c244
YCH
366 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
367 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
368 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
0857baad 369
ce6b9a2d
PH
370 # RTMP (unnamed)
371 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 372 }
23d17e4b 373 _SUBTITLE_FORMATS = ('ttml', 'vtt')
836a086c 374
78caa52a 375 IE_NAME = 'youtube'
2eb88d95
PH
376 _TESTS = [
377 {
b67d6314 378 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
379 'info_dict': {
380 'id': 'BaW_jenozKc',
381 'ext': 'mp4',
382 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
383 'uploader': 'Philipp Hagemeister',
384 'uploader_id': 'phihag',
385 'upload_date': '20121002',
386 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
387 'categories': ['Science & Technology'],
000b6b5a 388 'tags': ['youtube-dl'],
3e7c1224
PH
389 'like_count': int,
390 'dislike_count': int,
7c80519c 391 'start_time': 1,
297a564b 392 'end_time': 9,
2eb88d95 393 }
0e853ca4 394 },
0e853ca4 395 {
4bc3a23e
PH
396 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
397 'note': 'Test generic use_cipher_signature video (#897)',
398 'info_dict': {
399 'id': 'UxxajLWwzqY',
400 'ext': 'mp4',
401 'upload_date': '20120506',
402 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
0cb58b02 403 'alt_title': 'I Love It (feat. Charli XCX)',
000b6b5a
S
404 'description': 'md5:782e8651347686cba06e58f71ab51773',
405 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
406 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
407 'iconic ep', 'iconic', 'love', 'it'],
4bc3a23e
PH
408 'uploader': 'Icona Pop',
409 'uploader_id': 'IconaPop',
0cb58b02 410 'creator': 'Icona Pop',
2eb88d95 411 }
c108eb73
JMF
412 },
413 {
4bc3a23e
PH
414 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
415 'note': 'Test VEVO video with age protection (#956)',
416 'info_dict': {
417 'id': '07FYdnEawAQ',
418 'ext': 'mp4',
419 'upload_date': '20130703',
420 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
0cb58b02 421 'alt_title': 'Tunnel Vision',
4bc3a23e
PH
422 'description': 'md5:64249768eec3bc4276236606ea996373',
423 'uploader': 'justintimberlakeVEVO',
424 'uploader_id': 'justintimberlakeVEVO',
0cb58b02 425 'creator': 'Justin Timberlake',
34952f09 426 'age_limit': 18,
c108eb73
JMF
427 }
428 },
fccd3771 429 {
4bc3a23e
PH
430 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
431 'note': 'Embed-only video (#1746)',
432 'info_dict': {
433 'id': 'yZIXLfi8CZQ',
434 'ext': 'mp4',
435 'upload_date': '20120608',
436 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
437 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
438 'uploader': 'SET India',
94bfcd23
S
439 'uploader_id': 'setindia',
440 'age_limit': 18,
fccd3771
PH
441 }
442 },
11b56058 443 {
b67d6314 444 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
445 'note': 'Use the first video ID in the URL',
446 'info_dict': {
447 'id': 'BaW_jenozKc',
448 'ext': 'mp4',
449 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
450 'uploader': 'Philipp Hagemeister',
451 'uploader_id': 'phihag',
452 'upload_date': '20121002',
453 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
454 'categories': ['Science & Technology'],
455 'tags': ['youtube-dl'],
456 'like_count': int,
457 'dislike_count': int,
34a7de29
S
458 },
459 'params': {
460 'skip_download': True,
461 },
11b56058 462 },
dd27fd17 463 {
4bc3a23e
PH
464 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
465 'note': '256k DASH audio (format 141) via DASH manifest',
466 'info_dict': {
467 'id': 'a9LDPn-MO4I',
468 'ext': 'm4a',
469 'upload_date': '20121002',
470 'uploader_id': '8KVIDEO',
471 'description': '',
472 'uploader': '8KVIDEO',
473 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 474 },
4bc3a23e
PH
475 'params': {
476 'youtube_include_dash_manifest': True,
477 'format': '141',
4919603f 478 },
dd27fd17 479 },
3489b7d2
JMF
480 # DASH manifest with encrypted signature
481 {
78caa52a
PH
482 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
483 'info_dict': {
484 'id': 'IB3lcPjvWLA',
485 'ext': 'm4a',
b766eb27
JMF
486 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
487 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
488 'uploader': 'AfrojackVEVO',
489 'uploader_id': 'AfrojackVEVO',
490 'upload_date': '20131011',
3489b7d2 491 },
4bc3a23e 492 'params': {
78caa52a
PH
493 'youtube_include_dash_manifest': True,
494 'format': '141',
3489b7d2
JMF
495 },
496 },
aaeb86f6
S
497 # JS player signature function name containing $
498 {
499 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
500 'info_dict': {
501 'id': 'nfWlot6h_JM',
502 'ext': 'm4a',
503 'title': 'Taylor Swift - Shake It Off',
0cb58b02 504 'alt_title': 'Shake It Off',
f57b7835 505 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
aaeb86f6
S
506 'uploader': 'TaylorSwiftVEVO',
507 'uploader_id': 'TaylorSwiftVEVO',
508 'upload_date': '20140818',
0cb58b02 509 'creator': 'Taylor Swift',
aaeb86f6
S
510 },
511 'params': {
512 'youtube_include_dash_manifest': True,
513 'format': '141',
514 },
515 },
aa79ac0c
PH
516 # Controversy video
517 {
518 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
519 'info_dict': {
520 'id': 'T4XJQO3qol8',
521 'ext': 'mp4',
522 'upload_date': '20100909',
523 'uploader': 'The Amazing Atheist',
524 'uploader_id': 'TheAmazingAtheist',
525 'title': 'Burning Everyone\'s Koran',
526 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
527 }
c522adb1
JMF
528 },
529 # Normal age-gate video (No vevo, embed allowed)
530 {
531 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
532 'info_dict': {
533 'id': 'HtVdAasjOgU',
534 'ext': 'mp4',
535 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 536 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
537 'uploader': 'The Witcher',
538 'uploader_id': 'WitcherGame',
539 'upload_date': '20140605',
34952f09 540 'age_limit': 18,
c522adb1
JMF
541 },
542 },
fccae2b9
S
543 # Age-gate video with encrypted signature
544 {
545 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
546 'info_dict': {
547 'id': '6kLq3WMV1nU',
548 'ext': 'mp4',
549 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
550 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
551 'uploader': 'LloydVEVO',
552 'uploader_id': 'LloydVEVO',
553 'upload_date': '20110629',
34952f09 554 'age_limit': 18,
fccae2b9
S
555 },
556 },
774e208f
PH
557 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
558 {
559 'url': '__2ABJjxzNo',
560 'info_dict': {
561 'id': '__2ABJjxzNo',
562 'ext': 'mp4',
563 'upload_date': '20100430',
564 'uploader_id': 'deadmau5',
0cb58b02 565 'creator': 'deadmau5',
774e208f
PH
566 'description': 'md5:12c56784b8032162bb936a5f76d55360',
567 'uploader': 'deadmau5',
568 'title': 'Deadmau5 - Some Chords (HD)',
0cb58b02 569 'alt_title': 'Some Chords',
774e208f
PH
570 },
571 'expected_warnings': [
572 'DASH manifest missing',
573 ]
e52a40ab
PH
574 },
575 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
576 {
577 'url': 'lqQg6PlCWgI',
578 'info_dict': {
579 'id': 'lqQg6PlCWgI',
580 'ext': 'mp4',
90227264 581 'upload_date': '20150827',
cbe2bd91
PH
582 'uploader_id': 'olympic',
583 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
584 'uploader': 'Olympics',
585 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
586 },
587 'params': {
588 'skip_download': 'requires avconv',
e52a40ab 589 }
cbe2bd91 590 },
6271f1ca
PH
591 # Non-square pixels
592 {
593 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
594 'info_dict': {
595 'id': '_b-2C3KPAM0',
596 'ext': 'mp4',
597 'stretched_ratio': 16 / 9.,
598 'upload_date': '20110310',
599 'uploader_id': 'AllenMeow',
600 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
601 'uploader': '孫艾倫',
602 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
603 },
06b491eb
S
604 },
605 # url_encoded_fmt_stream_map is empty string
606 {
607 'url': 'qEJwOuvDf7I',
608 'info_dict': {
609 'id': 'qEJwOuvDf7I',
f57b7835 610 'ext': 'webm',
06b491eb
S
611 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
612 'description': '',
613 'upload_date': '20150404',
614 'uploader_id': 'spbelect',
615 'uploader': 'Наблюдатели Петербурга',
616 },
617 'params': {
618 'skip_download': 'requires avconv',
e323cf3f
S
619 },
620 'skip': 'This live event has ended.',
06b491eb 621 },
da77d856
S
622 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
623 {
624 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
625 'info_dict': {
626 'id': 'FIl7x6_3R5Y',
627 'ext': 'mp4',
628 'title': 'md5:7b81415841e02ecd4313668cde88737a',
629 'description': 'md5:116377fd2963b81ec4ce64b542173306',
630 'upload_date': '20150625',
631 'uploader_id': 'dorappi2000',
632 'uploader': 'dorappi2000',
633 'formats': 'mincount:33',
634 },
2ee8f5d8 635 },
8a1a26ce
YCH
636 # DASH manifest with segment_list
637 {
638 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
639 'md5': '8ce563a1d667b599d21064e982ab9e31',
640 'info_dict': {
641 'id': 'CsmdDsKjzN8',
642 'ext': 'mp4',
17ee98e1 643 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
644 'uploader': 'Airtek',
645 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
646 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
647 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
648 },
649 'params': {
650 'youtube_include_dash_manifest': True,
651 'format': '135', # bestvideo
652 }
2ee8f5d8 653 },
cf7e015f
S
654 {
655 # Multifeed videos (multiple cameras), URL is for Main Camera
656 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
657 'info_dict': {
658 'id': 'jqWvoWXjCVs',
659 'title': 'teamPGP: Rocket League Noob Stream',
660 'description': 'md5:dc7872fb300e143831327f1bae3af010',
661 },
662 'playlist': [{
663 'info_dict': {
664 'id': 'jqWvoWXjCVs',
665 'ext': 'mp4',
666 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
667 'description': 'md5:dc7872fb300e143831327f1bae3af010',
668 'upload_date': '20150721',
669 'uploader': 'Beer Games Beer',
670 'uploader_id': 'beergamesbeer',
671 },
672 }, {
673 'info_dict': {
674 'id': '6h8e8xoXJzg',
675 'ext': 'mp4',
676 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
677 'description': 'md5:dc7872fb300e143831327f1bae3af010',
678 'upload_date': '20150721',
679 'uploader': 'Beer Games Beer',
680 'uploader_id': 'beergamesbeer',
681 },
682 }, {
683 'info_dict': {
684 'id': 'PUOgX5z9xZw',
685 'ext': 'mp4',
686 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
687 'description': 'md5:dc7872fb300e143831327f1bae3af010',
688 'upload_date': '20150721',
689 'uploader': 'Beer Games Beer',
690 'uploader_id': 'beergamesbeer',
691 },
692 }, {
693 'info_dict': {
694 'id': 'teuwxikvS5k',
695 'ext': 'mp4',
696 'title': 'teamPGP: Rocket League Noob Stream (zim)',
697 'description': 'md5:dc7872fb300e143831327f1bae3af010',
698 'upload_date': '20150721',
699 'uploader': 'Beer Games Beer',
700 'uploader_id': 'beergamesbeer',
701 },
702 }],
703 'params': {
704 'skip_download': True,
705 },
cbaed4bb
S
706 },
707 {
708 'url': 'http://vid.plus/FlRa-iH7PGw',
709 'only_matching': True,
0e49d9a6
LL
710 },
711 {
61f92af1 712 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
a8776b10
S
713 # Also tests cut-off URL expansion in video description (see
714 # https://github.com/rg3/youtube-dl/issues/1892,
715 # https://github.com/rg3/youtube-dl/issues/8164)
0e49d9a6
LL
716 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
717 'info_dict': {
718 'id': 'lsguqyKfVQg',
719 'ext': 'mp4',
720 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
0cb58b02 721 'alt_title': 'Dark Walk',
0e49d9a6
LL
722 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
723 'upload_date': '20151119',
724 'uploader_id': 'IronSoulElf',
725 'uploader': 'IronSoulElf',
0cb58b02 726 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
0e49d9a6
LL
727 },
728 'params': {
729 'skip_download': True,
730 },
731 },
61f92af1
S
732 {
733 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
734 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
735 'only_matching': True,
736 },
313dfc45
LL
737 {
738 # Video with yt:stretch=17:0
739 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
740 'info_dict': {
741 'id': 'Q39EVAstoRM',
742 'ext': 'mp4',
743 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
744 'description': 'md5:ee18a25c350637c8faff806845bddee9',
745 'upload_date': '20151107',
746 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
747 'uploader': 'CH GAMER DROID',
748 },
749 'params': {
750 'skip_download': True,
751 },
752 },
040ac686
S
753 {
754 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
755 'only_matching': True,
756 }
2eb88d95
PH
757 ]
758
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Per-instance cache of deciphering functions, keyed by
    # (player_url, signature layout id); filled by _decrypt_signature.
    self._player_cache = {}
e0df6211 762
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage of video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 766
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that information about video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
770
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for video_id."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
774
def report_rtmp_download(self):
    """Announce that the download is going to use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 778
60064c53
PH
779 def _signature_cache_id(self, example_sig):
780 """ Return a string representation of a signature """
78caa52a 781 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
782
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures laid out like example_sig.

    The player id and type (file extension) are parsed from player_url.
    If the 'youtube-sigfuncs' cache already holds an index list for this
    player and signature layout, a function applying that index list is
    returned without downloading anything. Otherwise the player is
    downloaded and parsed, the index list it produces for a signature of
    this length is stored in the cache, and the parsed function is returned.

    Raises ExtractorError if player_url is not recognised or if the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: the cache key must not contain any path component
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, for each output position, the input index to copy
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which would leave `res` unbound below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function over a string whose i-th character has code point i,
    # so the code points of the output reveal which input index went where.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
828
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to func for signatures laid out like example_sig.

    func is applied to a probe string whose i-th character has code point i;
    the resulting index list is rendered as a sum of `s[...]` index and
    slice expressions and written with to_screen.
    """
    def gen_sig_code(idxs):
        """Yield `s[...]` expressions that together reproduce the index list idxs."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A slice end below zero cannot be written as a number, leave it open
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # Nothing to emit for an empty index list
        if not idxs:
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Pre-bind i so that a single-element index list (for which the
        # loop below never runs) still emits its only index.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The current run ended at prev
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # Start of an ascending or descending run
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 867
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable that runs the signature function found in the JS player code."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(sig_func_name)

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
876
def _parse_sig_swf(self, file_contents):
    """Return a callable that runs SignatureDecipher.decipher from the SWF player."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
883
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    Deciphering functions are kept in self._player_cache per player URL and
    signature layout. Any failure while obtaining or running the function
    is re-raised as ExtractorError carrying the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs get an explicit https scheme
    full_player_url = (
        'https:' + player_url if player_url.startswith('//') else player_url)
    try:
        cache_key = (full_player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, full_player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 907
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is fetched from the timedtext listing endpoint. For every
    language (first track wins) one entry per extension in
    self._SUBTITLE_FORMATS is built. An empty dict is returned, after a
    warning, when the listing cannot be downloaded or contains no tracks.
    """
    listing_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_xml(listing_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in track_listing.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track of each language is used
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
939
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ytplayer.config JSON found in webpage, or None if it is not found."""
    config_json = self._search_regex(
        (
            # User data may contain arbitrary character sequences that may
            # break regex-based JSON extraction, e.g. when '};' is contained
            # the second regex won't capture the whole JSON. As a workaround
            # the more concrete regex is tried first, until proper quoted
            # string handling replaces this (see
            # https://github.com/rg3/youtube-dl/issues/7468,
            # https://github.com/rg3/youtube-dl/pull/7599)
            r';ytplayer\.config\s*=\s*({.+?});ytplayer',
            r';ytplayer\.config\s*=\s*({.+?});',
        ),
        webpage, 'ytplayer.config', default=None)
    if not config_json:
        return None
    return self._parse_json(
        uppercase_escape(config_json), video_id, fatal=False)
0e49d9a6 957
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping target language code to automatic caption formats.

    The webpage is passed in because the captions URL is read from the
    player config embedded in it, which saves downloading it again. An empty
    dict is returned, after a warning, whenever the captions cannot be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_args = player_config['args']
        base_url = player_args['ttsurl']
        if not base_url:
            self._downloader.report_warning(err_msg)
            return {}
        timestamp = player_args['timestamp']

        # Ask for the list of available caption tracks and target languages
        listing = self._download_xml(
            base_url + '&' + compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            }),
            video_id)
        source_track = listing.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        source_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in listing.findall('target'):
            target_lang = target.attrib['lang_code']
            captions[target_lang] = [{
                'url': base_url + '&' + compat_urllib_parse.urlencode({
                    'lang': source_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in self._SUBTITLE_FORMATS]
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1012
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id (second group of _VALID_URL) of url, or raise ExtractorError."""
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1020
1d043b93
JMF
1021 def _extract_from_m3u8(self, manifest_url, video_id):
1022 url_map = {}
5f6a1245 1023
1d043b93
JMF
1024 def _get_urls(_manifest):
1025 lines = _manifest.split('\n')
1026 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 1027 lines)
1d043b93 1028 return urls
78caa52a 1029 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
1030 formats_urls = _get_urls(manifest)
1031 for format_url in formats_urls:
890f62e8 1032 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1033 url_map[itag] = format_url
1034 return url_map
1035
1fb07d10
JG
1036 def _extract_annotations(self, video_id):
1037 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 1038 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 1039
c5e8d7af 1040 def _real_extract(self, url):
cf7e015f
S
1041 url, smuggled_data = unsmuggle_url(url, {})
1042
7e8c0af0 1043 proto = (
78caa52a
PH
1044 'http' if self._downloader.params.get('prefer_insecure', False)
1045 else 'https')
7e8c0af0 1046
7c80519c 1047 start_time = None
297a564b 1048 end_time = None
7c80519c
JMF
1049 parsed_url = compat_urllib_parse_urlparse(url)
1050 for component in [parsed_url.fragment, parsed_url.query]:
1051 query = compat_parse_qs(component)
297a564b 1052 if start_time is None and 't' in query:
7c80519c 1053 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1054 if start_time is None and 'start' in query:
1055 start_time = parse_duration(query['start'][0])
297a564b
JMF
1056 if end_time is None and 'end' in query:
1057 end_time = parse_duration(query['end'][0])
7c80519c 1058
c5e8d7af
PH
1059 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1060 mobj = re.search(self._NEXT_URL_RE, url)
1061 if mobj:
7fd002c0 1062 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1063 video_id = self.extract_id(url)
c5e8d7af
PH
1064
1065 # Get video webpage
aa79ac0c 1066 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
a1f934b1 1067 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1068
1069 # Attempt to extract SWF player URL
e0df6211 1070 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1071 if mobj is not None:
1072 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1073 else:
1074 player_url = None
1075
d8d24a92
S
1076 dash_mpds = []
1077
1078 def add_dash_mpd(video_info):
1079 dash_mpd = video_info.get('dashmpd')
1080 if dash_mpd and dash_mpd[0] not in dash_mpds:
1081 dash_mpds.append(dash_mpd[0])
1082
c5e8d7af 1083 # Get video info
6449cd80 1084 embed_webpage = None
2fe1ff85 1085 is_live = None
c108eb73 1086 if re.search(r'player-age-gate-content">', video_webpage) is not None:
c108eb73
JMF
1087 age_gate = True
1088 # We simulate the access to the video from www.youtube.com/v/{video_id}
1089 # this can be viewed without login into Youtube
beb95e77
CL
1090 url = proto + '://www.youtube.com/embed/%s' % video_id
1091 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
2c57c7fa
JMF
1092 data = compat_urllib_parse.urlencode({
1093 'video_id': video_id,
1094 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934 1095 'sts': self._search_regex(
beb95e77 1096 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
2c57c7fa 1097 })
7e8c0af0 1098 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
94bd3613
PH
1099 video_info_webpage = self._download_webpage(
1100 video_info_url, video_id,
20436c30 1101 note='Refetching age-gated info webpage',
94bd3613 1102 errnote='unable to download video info webpage')
c5e8d7af 1103 video_info = compat_parse_qs(video_info_webpage)
d8d24a92 1104 add_dash_mpd(video_info)
c108eb73
JMF
1105 else:
1106 age_gate = False
bc93bdb5 1107 video_info = None
d8d24a92 1108 # Try looking directly into the video webpage
a72778d3
S
1109 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1110 if ytplayer_config:
4e62ebe2 1111 args = ytplayer_config['args']
d8d24a92
S
1112 if args.get('url_encoded_fmt_stream_map'):
1113 # Convert to the same format returned by compat_parse_qs
1114 video_info = dict((k, [v]) for k, v in args.items())
1115 add_dash_mpd(video_info)
2fe1ff85
JMF
1116 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1117 is_live = True
0a3cf9ad
S
1118 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1119 # We also try looking in get_video_info since it may contain different dashmpd
1120 # URL that points to a DASH manifest with possibly different itag set (some itags
1121 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1122 # manifest pointed by get_video_info's dashmpd).
1123 # The general idea is to take a union of itags of both DASH manifests (for example
1124 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
4e62ebe2 1125 self.report_video_info_webpage_download(video_id)
0a3cf9ad 1126 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
810fb84d
PH
1127 video_info_url = (
1128 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1129 % (proto, video_id, el_type))
1130 video_info_webpage = self._download_webpage(
1131 video_info_url,
4e62ebe2
JMF
1132 video_id, note=False,
1133 errnote='unable to download video info webpage')
0a3cf9ad 1134 get_video_info = compat_parse_qs(video_info_webpage)
87dc4511
JMF
1135 if get_video_info.get('use_cipher_signature') != ['True']:
1136 add_dash_mpd(get_video_info)
0a3cf9ad
S
1137 if not video_info:
1138 video_info = get_video_info
1139 if 'token' in get_video_info:
89ea063e
S
1140 # Different get_video_info requests may report different results, e.g.
1141 # some may report video unavailability, but some may serve it without
1142 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1143 # the original webpage as well as el=info and el=embedded get_video_info
1144 # requests report video unavailability due to geo restriction while
1145 # el=detailpage succeeds and returns valid data). This is probably
1146 # due to YouTube measures against IP ranges of hosting providers.
1147 # Working around by preferring the first succeeded video_info containing
1148 # the token if no such video_info yet was found.
44b2264f
S
1149 if 'token' not in video_info:
1150 video_info = get_video_info
4e62ebe2 1151 break
c5e8d7af
PH
1152 if 'token' not in video_info:
1153 if 'reason' in video_info:
af214c3a
YCH
1154 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1155 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
678e436f 1156 if regions_allowed:
af214c3a
YCH
1157 raise ExtractorError('YouTube said: This video is available in %s only' % (
1158 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1159 expected=True)
d11271dd 1160 raise ExtractorError(
78caa52a 1161 'YouTube said: %s' % video_info['reason'][0],
d11271dd 1162 expected=True, video_id=video_id)
c5e8d7af 1163 else:
d11271dd 1164 raise ExtractorError(
78caa52a 1165 '"token" parameter not in video info for unknown reason',
d11271dd 1166 video_id=video_id)
c5e8d7af 1167
cf7e015f
S
1168 # title
1169 if 'title' in video_info:
1170 video_title = video_info['title'][0]
1171 else:
1172 self._downloader.report_warning('Unable to extract video title')
1173 video_title = '_'
1174
1175 # description
1176 video_description = get_element_by_id("eow-description", video_webpage)
1177 if video_description:
1178 video_description = re.sub(r'''(?x)
1179 <a\s+
1180 (?:[a-zA-Z-]+="[^"]+"\s+)*?
23f13e97 1181 (?:title|href)="([^"]+)"\s+
cf7e015f 1182 (?:[a-zA-Z-]+="[^"]+"\s+)*?
096b5339 1183 class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
23f13e97 1184 [^<]+\.{3}\s*
cf7e015f
S
1185 </a>
1186 ''', r'\1', video_description)
1187 video_description = clean_html(video_description)
1188 else:
1189 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1190 if fd_mobj:
1191 video_description = unescapeHTML(fd_mobj.group(1))
1192 else:
1193 video_description = ''
1194
5e1eddb9
S
1195 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1196 if not self._downloader.params.get('noplaylist'):
1197 entries = []
1198 feed_ids = []
6863631c 1199 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
5e1eddb9 1200 for feed in multifeed_metadata_list.split(','):
6863631c
S
1201 # Unquote should take place before split on comma (,) since textual
1202 # fields may contain comma as well (see
1203 # https://github.com/rg3/youtube-dl/issues/8536)
1204 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
5e1eddb9
S
1205 entries.append({
1206 '_type': 'url_transparent',
1207 'ie_key': 'Youtube',
1208 'url': smuggle_url(
1209 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1210 {'force_singlefeed': True}),
1211 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1212 })
1213 feed_ids.append(feed_data['id'][0])
1214 self.to_screen(
1215 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1216 % (', '.join(feed_ids), video_id))
1217 return self.playlist_result(entries, video_id, video_title, video_description)
1218 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1219
1d699755
PH
1220 if 'view_count' in video_info:
1221 view_count = int(video_info['view_count'][0])
1222 else:
1223 view_count = None
1224
c5e8d7af
PH
1225 # Check for "rental" videos
1226 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
69ea8ca4 1227 raise ExtractorError('"rental" videos not supported')
c5e8d7af
PH
1228
1229 # Start extracting information
1230 self.report_information_extraction(video_id)
1231
1232 # uploader
1233 if 'author' not in video_info:
69ea8ca4 1234 raise ExtractorError('Unable to extract uploader name')
7fd002c0 1235 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
c5e8d7af
PH
1236
1237 # uploader_id
1238 video_uploader_id = None
1239 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1240 if mobj is not None:
1241 video_uploader_id = mobj.group(1)
1242 else:
69ea8ca4 1243 self._downloader.report_warning('unable to extract uploader nickname')
c5e8d7af 1244
c5e8d7af 1245 # thumbnail image
7763b04e
JMF
1246 # We try first to get a high quality image:
1247 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1248 video_webpage, re.DOTALL)
1249 if m_thumb is not None:
1250 video_thumbnail = m_thumb.group(1)
1251 elif 'thumbnail_url' not in video_info:
69ea8ca4 1252 self._downloader.report_warning('unable to extract video thumbnail')
f490e77e 1253 video_thumbnail = None
c5e8d7af 1254 else: # don't panic if we can't find it
7fd002c0 1255 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
c5e8d7af
PH
1256
1257 # upload date
9d0b581f
S
1258 upload_date = self._html_search_meta(
1259 'datePublished', video_webpage, 'upload date', default=None)
1260 if not upload_date:
1261 upload_date = self._search_regex(
1262 [r'(?s)id="eow-date.*?>(.*?)</span>',
1263 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1264 video_webpage, 'upload date', default=None)
1265 if upload_date:
1266 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1267 upload_date = unified_strdate(upload_date)
c5e8d7af 1268
0cb58b02
S
1269 m_music = re.search(
1270 r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
1271 video_webpage)
1272 if m_music:
1273 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1274 video_creator = clean_html(m_music.group('creator'))
1275 else:
1276 video_alt_title = video_creator = None
1277
55f7bd2d
PH
1278 m_cat_container = self._search_regex(
1279 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
624dcebf 1280 video_webpage, 'categories', default=None)
ec8deefc 1281 if m_cat_container:
ad3bc6ac 1282 category = self._html_search_regex(
01ed5c9b 1283 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
1284 default=None)
1285 video_categories = None if category is None else [category]
1286 else:
1287 video_categories = None
ec8deefc 1288
000b6b5a
S
1289 video_tags = [
1290 unescapeHTML(m.group('content'))
1291 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1292
f30a38be 1293 def _extract_count(count_name):
c93d53f5
S
1294 return str_to_int(self._search_regex(
1295 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1296 % re.escape(count_name),
1297 video_webpage, count_name, default=None))
1298
69ea8ca4
PH
1299 like_count = _extract_count('like')
1300 dislike_count = _extract_count('dislike')
336c3a69 1301
c5e8d7af 1302 # subtitles
d82134c3 1303 video_subtitles = self.extract_subtitles(video_id, video_webpage)
360e1ca5 1304 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
c5e8d7af
PH
1305
1306 if 'length_seconds' not in video_info:
69ea8ca4 1307 self._downloader.report_warning('unable to extract video duration')
b466b702 1308 video_duration = None
c5e8d7af 1309 else:
7fd002c0 1310 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1311
1fb07d10
JG
1312 # annotations
1313 video_annotations = None
1314 if self._downloader.params.get('writeannotations', False):
5f6a1245 1315 video_annotations = self._extract_annotations(video_id)
1fb07d10 1316
dd27fd17
PH
1317 def _map_to_format_list(urlmap):
1318 formats = []
1319 for itag, video_real_url in urlmap.items():
1320 dct = {
1321 'format_id': itag,
1322 'url': video_real_url,
1323 'player_url': player_url,
1324 }
0b65e5d4
PH
1325 if itag in self._formats:
1326 dct.update(self._formats[itag])
dd27fd17
PH
1327 formats.append(dct)
1328 return formats
1329
c5e8d7af
PH
1330 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1331 self.report_rtmp_download()
dd27fd17
PH
1332 formats = [{
1333 'format_id': '_rtmp',
1334 'protocol': 'rtmp',
1335 'url': video_info['conn'][0],
1336 'player_url': player_url,
1337 }]
24270b03 1338 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
5f6a1245 1339 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1340 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1341 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c9afb51c 1342 formats = []
00fe14fc 1343 for url_data_str in encoded_url_map.split(','):
c5e8d7af 1344 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
1345 if 'itag' not in url_data or 'url' not in url_data:
1346 continue
1347 format_id = url_data['itag'][0]
1348 url = url_data['url'][0]
1349
1350 if 'sig' in url_data:
1351 url += '&signature=' + url_data['sig'][0]
1352 elif 's' in url_data:
1353 encrypted_sig = url_data['s'][0]
6449cd80 1354 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
201e9eaa 1355
beb95e77 1356 jsplayer_url_json = self._search_regex(
6449cd80
PH
1357 ASSETS_RE,
1358 embed_webpage if age_gate else video_webpage,
1359 'JS player URL (1)', default=None)
1360 if not jsplayer_url_json and not age_gate:
1361 # We need the embed website after all
1362 if embed_webpage is None:
1363 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1364 embed_webpage = self._download_webpage(
1365 embed_url, video_id, 'Downloading embed webpage')
1366 jsplayer_url_json = self._search_regex(
1367 ASSETS_RE, embed_webpage, 'JS player URL')
1368
beb95e77 1369 player_url = json.loads(jsplayer_url_json)
201e9eaa
PH
1370 if player_url is None:
1371 player_url_json = self._search_regex(
1372 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 1373 video_webpage, 'age gate player URL')
201e9eaa
PH
1374 player_url = json.loads(player_url_json)
1375
1376 if self._downloader.params.get('verbose'):
cf010131 1377 if player_url is None:
201e9eaa
PH
1378 player_version = 'unknown'
1379 player_desc = 'unknown'
1380 else:
1381 if player_url.endswith('swf'):
1382 player_version = self._search_regex(
1383 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 1384 'flash player', fatal=False)
201e9eaa 1385 player_desc = 'flash player %s' % player_version
cf010131 1386 else:
201e9eaa 1387 player_version = self._search_regex(
50f84a9a 1388 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
201e9eaa
PH
1389 player_url,
1390 'html5 player', fatal=False)
78caa52a 1391 player_desc = 'html5 player %s' % player_version
201e9eaa 1392
60064c53 1393 parts_sizes = self._signature_cache_id(encrypted_sig)
69ea8ca4 1394 self.to_screen('{%s} signature length %s, %s' %
9e1a5b84 1395 (format_id, parts_sizes, player_desc))
201e9eaa
PH
1396
1397 signature = self._decrypt_signature(
1398 encrypted_sig, video_id, player_url, age_gate)
1399 url += '&signature=' + signature
1400 if 'ratebypass' not in url:
1401 url += '&ratebypass=yes'
c9afb51c 1402
94278f72
YCH
1403 dct = {
1404 'format_id': format_id,
1405 'url': url,
1406 'player_url': player_url,
1407 }
1408 if format_id in self._formats:
1409 dct.update(self._formats[format_id])
1410
aabc2be6
S
1411 # Some itags are not included in DASH manifest thus corresponding formats will
1412 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1413 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1414 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1415 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72
YCH
1416
1417 more_fields = {
c9afb51c 1418 'filesize': int_or_none(url_data.get('clen', [None])[0]),
aabc2be6 1419 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
c9afb51c
AH
1420 'width': width,
1421 'height': height,
1422 'fps': int_or_none(url_data.get('fps', [None])[0]),
aabc2be6 1423 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
c9afb51c 1424 }
94278f72
YCH
1425 for key, value in more_fields.items():
1426 if value:
1427 dct[key] = value
aabc2be6
S
1428 type_ = url_data.get('type', [None])[0]
1429 if type_:
1430 type_split = type_.split(';')
1431 kind_ext = type_split[0].split('/')
1432 if len(kind_ext) == 2:
94278f72
YCH
1433 kind, _ = kind_ext
1434 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
1435 if kind in ('audio', 'video'):
1436 codecs = None
1437 for mobj in re.finditer(
1438 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1439 if mobj.group('key') == 'codecs':
1440 codecs = mobj.group('val')
1441 break
1442 if codecs:
1443 codecs = codecs.split(',')
1444 if len(codecs) == 2:
cc28492d 1445 acodec, vcodec = codecs[1], codecs[0]
aabc2be6
S
1446 else:
1447 acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
1448 dct.update({
1449 'acodec': acodec,
1450 'vcodec': vcodec,
1451 })
aabc2be6 1452 formats.append(dct)
1d043b93
JMF
1453 elif video_info.get('hlsvp'):
1454 manifest_url = video_info['hlsvp'][0]
1455 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 1456 formats = _map_to_format_list(url_map)
ac5a69af
YCH
1457 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1458 for a_format in formats:
049d71d8 1459 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
c5e8d7af 1460 else:
8ceabd4d
S
1461 unavailable_message = self._html_search_regex(
1462 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1463 video_webpage, 'unavailable message', default=None)
1464 if unavailable_message:
1465 raise ExtractorError(unavailable_message, expected=True)
69ea8ca4 1466 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1467
dd27fd17 1468 # Look for the DASH manifest
203fb43f 1469 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 1470 dash_mpd_fatal = True
8ff648e4 1471 for mpd_url in dash_mpds:
d8d24a92 1472 dash_formats = {}
774e208f 1473 try:
05d0d131
YCH
1474 def decrypt_sig(mobj):
1475 s = mobj.group(1)
1476 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1477 return '/signature/%s' % dec_s
1478
8ff648e4 1479 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 1480
8ff648e4 1481 for df in self._extract_mpd_formats(
1482 mpd_url, video_id, fatal=dash_mpd_fatal,
1483 formats_dict=self._formats):
d8d24a92
S
1484 # Do not overwrite DASH format found in some previous DASH manifest
1485 if df['format_id'] not in dash_formats:
1486 dash_formats[df['format_id']] = df
77c6fb5b
S
1487 # Additional DASH manifests may end up in HTTP Error 403 therefore
1488 # allow them to fail without bug report message if we already have
1489 # some DASH manifest succeeded. This is temporary workaround to reduce
1490 # burst of bug reports until we figure out the reason and whether it
1491 # can be fixed at all.
1492 dash_mpd_fatal = False
774e208f
PH
1493 except (ExtractorError, KeyError) as e:
1494 self.report_warning(
1495 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 1496 if dash_formats:
04b3b3df
JMF
1497 # Remove the formats we found through non-DASH, they
1498 # contain less info and it can be wrong, because we use
1499 # fixed values (for example the resolution). See
1500 # https://github.com/rg3/youtube-dl/issues/5774 for an
1501 # example.
d80265cc 1502 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 1503 formats.extend(dash_formats.values())
d80044c2 1504
6271f1ca
PH
1505 # Check for malformed aspect ratio
1506 stretched_m = re.search(
1507 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1508 video_webpage)
1509 if stretched_m:
313dfc45
LL
1510 w = float(stretched_m.group('w'))
1511 h = float(stretched_m.group('h'))
5faf9fed
S
1512 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
1513 # We will only process correct ratios.
313dfc45 1514 if w > 0 and h > 0:
41f24c32 1515 ratio = w / h
313dfc45
LL
1516 for f in formats:
1517 if f.get('vcodec') != 'none':
1518 f['stretched_ratio'] = ratio
6271f1ca 1519
4bcc7bd1 1520 self._sort_formats(formats)
4ea3be0a 1521
1522 return {
8bcc8756
JW
1523 'id': video_id,
1524 'uploader': video_uploader,
1525 'uploader_id': video_uploader_id,
1526 'upload_date': upload_date,
0cb58b02 1527 'creator': video_creator,
8bcc8756 1528 'title': video_title,
0cb58b02 1529 'alt_title': video_alt_title,
8bcc8756
JW
1530 'thumbnail': video_thumbnail,
1531 'description': video_description,
1532 'categories': video_categories,
000b6b5a 1533 'tags': video_tags,
8bcc8756 1534 'subtitles': video_subtitles,
360e1ca5 1535 'automatic_captions': automatic_captions,
8bcc8756
JW
1536 'duration': video_duration,
1537 'age_limit': 18 if age_gate else 0,
1538 'annotations': video_annotations,
7e8c0af0 1539 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 1540 'view_count': view_count,
4ea3be0a 1541 'like_count': like_count,
1542 'dislike_count': dislike_count,
2d30521a 1543 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
8bcc8756 1544 'formats': formats,
2fe1ff85 1545 'is_live': is_live,
7c80519c 1546 'start_time': start_time,
297a564b 1547 'end_time': end_time,
4ea3be0a 1548 }
c5e8d7af 1549
5f6a1245 1550
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists and auto-generated mixes.

    Matches playlist URLs (``playlist?list=``, ``watch?...list=``,
    ``embed/videoseries?list=``, ``/p/<id>`` ...) as well as bare playlist
    ids that start with one of the prefixes listed in ``_VALID_URL``.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?[&;])*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', which left the minimum entry
        # count of this test case unchecked.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extraction so the session is authenticated."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix and return a playlist result.

        Mixes are generated from a single video: the playlist id is a prefix
        followed by the 11-character video id, so the mix is read from that
        video's watch page rather than from a playlist page.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        # Try the known title containers from most to least specific.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and return a playlist result.

        Raises ExtractorError (expected) when YouTube reports that the
        playlist does not exist, is private, or that the parameters are
        invalid. Other alert messages are only reported as warnings.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                # Harmless language-selection banner, not an error.
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch a URL to single-video, mix or regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith(('RD', 'UL')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1717
c5e8d7af 1718
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos of a YouTube channel (``/channel/<id>`` URLs)."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that YoutubePlaylistsIE claims, else use the default match."""
        if YoutubePlaylistsIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the channel's videos, preferring its uploads playlist."""
        channel_id = self._match_id(url)
        url = self._TEMPLATE_URL % channel_id

        # Page-by-page channel listing is capped at 35 pages of 30 items,
        # i.e. 1050 videos in total (see #5778). When the channel id can be
        # found, hand over to the uploads playlist instead; otherwise fall
        # back to the paged channel listing below.
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if channel_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None
            ) or self._search_regex(
                r'data-(?:channel-external-|yt)id="([^"]+)"',
                channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC...' channel id -> 'UU...' playlist id
            uploads_id = 'UU' + channel_playlist_id[2:]
            uploads_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % uploads_id)
            return self.url_result(uploads_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_m = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_m is None:
            return self.playlist_result(
                self._entries(channel_page, channel_id), channel_id)

        # Auto-generated channels keep all their videos on a single page;
        # the ajax pages can't be used, they are empty.
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1791
1792
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a user's videos (``/user/<name>`` URLs or ``ytuser:<name>``)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that no other ``*IE`` class in this module accepts."""
        # The regex of this extractor is too permissive and would also match
        # URLs meant for the other YouTube extractors, so give every other
        # extractor defined in this module the first chance.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1819
b05654f0 1820
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the ``/playlists`` page of a user or channel."""
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'Thirst for Science',
            },
        },
        {
            # with "Load more" button
            'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
        },
    ]
0c148415
S
1849
1850
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` queries, paging through the results pages."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one by one until ``n`` videos have been
        collected or a page yields no new videos. Raises ExtractorError
        (expected) when a page carries a search message instead of results.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough videos are collected. The previous '>'
            # comparison downloaded one extra page when exactly n results
            # had been gathered.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1894
c9ae7b95 1895
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that sorts results by upload date."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1901
c9ae7b95
PH
1902
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for ``youtube.com/results?search_query=...`` result pages."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one URL entry per result listed on the page."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        # The results live inside the "item-section" ordered list.
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is headed by an <h3> carrying the title and the link.
        for part_code in re.findall(
                r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>',
                result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'],
                part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1944
1945
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs, read through the show's playlists page."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the base extractor using the show's ``/playlists`` URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
1963
1964
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extraction; feeds set ``_LOGIN_REQUIRED``."""
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed, following "load more" pages.

        Returns a playlist result titled ``_PLAYLIST_TITLE``.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 filter() returns an
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            ids.extend(new_ids)
            seen_ids.update(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
2012
2013
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "watch later" list, i.e. the playlist with id ``WL``."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # Intentionally empty so the tests of the parent extractor are not inherited.
    _TESTS = []

    def _real_extract(self, url):
        """Ignore the URL details and always extract playlist ``WL``."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 2023
5f6a1245 2024
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and defer to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=<id>" occurrence on the page names the playlist.
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
2035
2036
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "recommended" feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 2042
1ed5b5c9 2043
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "subscriptions" feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 2049
1ed5b5c9 2050
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the "history" feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: without the r prefix, '\.' is an invalid string escape,
    # which is deprecated and triggers a warning on modern Python. The
    # resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
2056
2057
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack a video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a hint about shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
2105
2106
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video id has 1 to 10 characters and report them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)