]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
Revert "pull changes from remote master (#190)" (#193)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 dict_get,
33 error_to_compat_str,
34 extract_attributes,
35 ExtractorError,
36 float_or_none,
37 get_element_by_attribute,
38 get_element_by_id,
39 int_or_none,
40 mimetype2ext,
41 orderedSet,
42 parse_codecs,
43 parse_duration,
44 remove_quotes,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 uppercase_escape,
54 url_or_none,
55 urlencode_postdata,
56 )
57
58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of the multi-step Google web sign-in flow used by _login()
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches the known playlist-id prefixes followed by the id body
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie so YouTube serves its English interface
        (scraped strings are then predictable)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video ids into url_result dicts handled by the
        Youtube extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            # A cookie file can substitute for credentials even when login
            # is required
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # NOTE(review): returns None (falsy), not False, on download
            # failure — callers that truth-test are unaffected
            return

        # Hidden <input> fields must be echoed back on every flow request
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST one step of the sign-in flow; returns parsed JSON or
            False on failure (fatal=False)."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses carry an anti-XSSI prefix before the first '[';
                # strip everything up to it
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Positional JSON payload for the account-lookup step; the index
        # meanings are reverse-engineered from the web client and opaque
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # The opaque user hash identifies the account in later steps
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Password-challenge payload, again positional/opaque
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # NOTE(review): returns None (falsy) here, unlike the False
            # returned in the analogous branches above
            return

        # A non-empty entry at [0][5] signals a login error
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # 'TL' token ties the TFA submission to this challenge
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Authenticator codes are sometimes shown with a 'G-' prefix
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Any other challenge cannot be solved programmatically;
                # tell the user to resolve it in a browser
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Visiting the CheckCookie URL finalizes the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Wrap the base downloader, forcing disable_polymer=true so YouTube
        serves the legacy (scrapable) page layout."""
        # Copy so the caller's query dict is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set language cookie and attempt login before extraction."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
281
282
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following the AJAX "Load more"
        pagination until no button remains or an empty chunk arrives.

        Subclasses supply _process_page() to parse one HTML chunk.
        """
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            # The "Load more" widget carries the relative URL of the next
            # JSON chunk; no match means we reached the last page
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        transform_source=uppercase_escape)
                    break
                except ExtractorError as e:
                    # Retry only on 500/503; re-raise once retries are
                    # exhausted or on any other error
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']
320
321
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base class for extractors that pull individual videos out of a
    playlist-like HTML page (using the subclass's _VIDEO_RE pattern)."""

    def _process_page(self, content):
        """Yield a url_result for every video found in *content*."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, mutating the parallel lists
        *ids_in_page*/*titles_in_page* in place.

        Each video id is recorded once; a later match may fill in a missing
        title for an already-seen id.
        """
        for mobj in re.finditer(video_re, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # BUGFIX: compare the 'index' group, not 'id' — a video id is
            # 11 characters and can never equal '0', so the old check
            # (mobj.group('id') == '0') never skipped anything.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(
                mobj.group('title')) if 'title' in mobj.groupdict() else None
            if video_title:
                video_title = video_title.strip()
                if video_title == '► Play all':
                    # The "Play all" link is not a real video title
                    video_title = None
            try:
                idx = ids_in_page.index(video_id)
                # Known id: keep the first non-empty title we encounter
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs found in *page*."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
353
354
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base class for pages that list several playlists (the subclass
    provides the URL pattern; pagination comes from the parent class)."""

    def _process_page(self, content):
        """Yield a YoutubePlaylist url_result for each playlist linked in
        *content*."""
        lockup_re = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        # orderedSet de-duplicates while preserving first-seen order
        for playlist_id in orderedSet(re.findall(lockup_re, content)):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page and wrap its playlists in a playlist_result."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        page_title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, page_title)
368
369
370 class YoutubeIE(YoutubeBaseInfoExtractor):
371 IE_DESC = 'YouTube.com'
372 _VALID_URL = r"""(?x)^
373 (
374 (?:https?://|//) # http(s):// or protocol-independent URL
375 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
376 (?:www\.)?deturl\.com/www\.youtube\.com/|
377 (?:www\.)?pwnyoutube\.com/|
378 (?:www\.)?hooktube\.com/|
379 (?:www\.)?yourepeat\.com/|
380 tube\.majestyc\.net/|
381 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
382 (?:(?:www|dev)\.)?invidio\.us/|
383 (?:(?:www|no)\.)?invidiou\.sh/|
384 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
385 (?:www\.)?invidious\.kabi\.tk/|
386 (?:www\.)?invidious\.13ad\.de/|
387 (?:www\.)?invidious\.mastodon\.host/|
388 (?:www\.)?invidious\.nixnet\.xyz/|
389 (?:www\.)?invidious\.drycat\.fr/|
390 (?:www\.)?tube\.poal\.co/|
391 (?:www\.)?vid\.wxzm\.sx/|
392 (?:www\.)?yt\.elukerio\.org/|
393 (?:www\.)?yt\.lelux\.fi/|
394 (?:www\.)?kgg2m7yk5aybusll\.onion/|
395 (?:www\.)?qklhadlycap4cnod\.onion/|
396 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
397 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
398 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
399 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
400 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
401 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
402 (?:.*?\#/)? # handle anchor (#/) redirect urls
403 (?: # the various things that can precede the ID:
404 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
405 |(?: # or the v= param in all its forms
406 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
407 (?:\?|\#!?) # the params delimiter ? or # or #!
408 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
409 v=
410 )
411 ))
412 |(?:
413 youtu\.be| # just youtu.be/xxxx
414 vid\.plus| # or vid.plus/xxxx
415 zwearz\.com/watch| # or zwearz.com/watch/xxxx
416 )/
417 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
418 )
419 )? # all until now is optional -> you can pass the naked ID
420 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
421 (?!.*?\blist=
422 (?:
423 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
424 WL # WL are handled by the watch later IE
425 )
426 )
427 (?(1).+)? # if we found the ID, everything can follow
428 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
429 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
430 _formats = {
431 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
432 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
433 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
434 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
435 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
436 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
437 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
438 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
439 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
440 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
441 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
442 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
443 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
444 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
445 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
446 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
447 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
448 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
449
450
451 # 3D videos
452 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
453 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
454 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
455 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
456 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
457 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
458 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
459
460 # Apple HTTP Live Streaming
461 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
462 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
463 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
464 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
465 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
466 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
467 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
468 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
469
470 # DASH mp4 video
471 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
472 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
473 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
474 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
475 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
476 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
477 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
478 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
479 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
480 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
481 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
482 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
483
484 # Dash mp4 audio
485 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
486 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
487 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
488 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
489 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
490 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
491 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
492
493 # Dash webm
494 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
495 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
496 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
497 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
498 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
499 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
500 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
501 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
502 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
503 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
504 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
505 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
506 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
507 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
508 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
509 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
510 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
511 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
512 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
513 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
514 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
515 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
516
517 # Dash webm audio
518 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
519 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
520
521 # Dash webm audio with opus inside
522 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
523 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
524 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
525
526 # RTMP (unnamed)
527 '_rtmp': {'protocol': 'rtmp'},
528
529 # av01 video only formats sometimes served with "unknown" codecs
530 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
531 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
532 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
533 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
534 }
535 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
536
537 _GEO_BYPASS = False
538
539 IE_NAME = 'youtube'
540 _TESTS = [
541 {
542 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
543 'info_dict': {
544 'id': 'BaW_jenozKc',
545 'ext': 'mp4',
546 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
547 'uploader': 'Philipp Hagemeister',
548 'uploader_id': 'phihag',
549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
550 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
551 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
552 'upload_date': '20121002',
553 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
554 'categories': ['Science & Technology'],
555 'tags': ['youtube-dl'],
556 'duration': 10,
557 'view_count': int,
558 'like_count': int,
559 'dislike_count': int,
560 'start_time': 1,
561 'end_time': 9,
562 }
563 },
564 {
565 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
566 'note': 'Test generic use_cipher_signature video (#897)',
567 'info_dict': {
568 'id': 'UxxajLWwzqY',
569 'ext': 'mp4',
570 'upload_date': '20120506',
571 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
572 'alt_title': 'I Love It (feat. Charli XCX)',
573 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
574 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
575 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
576 'iconic ep', 'iconic', 'love', 'it'],
577 'duration': 180,
578 'uploader': 'Icona Pop',
579 'uploader_id': 'IconaPop',
580 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
581 'creator': 'Icona Pop',
582 'track': 'I Love It (feat. Charli XCX)',
583 'artist': 'Icona Pop',
584 }
585 },
586 {
587 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
588 'note': 'Test VEVO video with age protection (#956)',
589 'info_dict': {
590 'id': '07FYdnEawAQ',
591 'ext': 'mp4',
592 'upload_date': '20130703',
593 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
594 'alt_title': 'Tunnel Vision',
595 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
596 'duration': 419,
597 'uploader': 'justintimberlakeVEVO',
598 'uploader_id': 'justintimberlakeVEVO',
599 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
600 'creator': 'Justin Timberlake',
601 'track': 'Tunnel Vision',
602 'artist': 'Justin Timberlake',
603 'age_limit': 18,
604 }
605 },
606 {
607 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
608 'note': 'Embed-only video (#1746)',
609 'info_dict': {
610 'id': 'yZIXLfi8CZQ',
611 'ext': 'mp4',
612 'upload_date': '20120608',
613 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
614 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
615 'uploader': 'SET India',
616 'uploader_id': 'setindia',
617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
618 'age_limit': 18,
619 }
620 },
621 {
622 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
623 'note': 'Use the first video ID in the URL',
624 'info_dict': {
625 'id': 'BaW_jenozKc',
626 'ext': 'mp4',
627 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
628 'uploader': 'Philipp Hagemeister',
629 'uploader_id': 'phihag',
630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
631 'upload_date': '20121002',
632 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
633 'categories': ['Science & Technology'],
634 'tags': ['youtube-dl'],
635 'duration': 10,
636 'view_count': int,
637 'like_count': int,
638 'dislike_count': int,
639 },
640 'params': {
641 'skip_download': True,
642 },
643 },
644 {
645 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
646 'note': '256k DASH audio (format 141) via DASH manifest',
647 'info_dict': {
648 'id': 'a9LDPn-MO4I',
649 'ext': 'm4a',
650 'upload_date': '20121002',
651 'uploader_id': '8KVIDEO',
652 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
653 'description': '',
654 'uploader': '8KVIDEO',
655 'title': 'UHDTV TEST 8K VIDEO.mp4'
656 },
657 'params': {
658 'youtube_include_dash_manifest': True,
659 'format': '141',
660 },
661 'skip': 'format 141 not served anymore',
662 },
663 # DASH manifest with encrypted signature
664 {
665 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
666 'info_dict': {
667 'id': 'IB3lcPjvWLA',
668 'ext': 'm4a',
669 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
670 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
671 'duration': 244,
672 'uploader': 'AfrojackVEVO',
673 'uploader_id': 'AfrojackVEVO',
674 'upload_date': '20131011',
675 },
676 'params': {
677 'youtube_include_dash_manifest': True,
678 'format': '141/bestaudio[ext=m4a]',
679 },
680 },
681 # JS player signature function name containing $
682 {
683 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
684 'info_dict': {
685 'id': 'nfWlot6h_JM',
686 'ext': 'm4a',
687 'title': 'Taylor Swift - Shake It Off',
688 'description': 'md5:bec2185232c05479482cb5a9b82719bf',
689 'duration': 242,
690 'uploader': 'TaylorSwiftVEVO',
691 'uploader_id': 'TaylorSwiftVEVO',
692 'upload_date': '20140818',
693 'creator': 'Taylor Swift',
694 },
695 'params': {
696 'youtube_include_dash_manifest': True,
697 'format': '141/bestaudio[ext=m4a]',
698 },
699 },
700 # Controversy video
701 {
702 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
703 'info_dict': {
704 'id': 'T4XJQO3qol8',
705 'ext': 'mp4',
706 'duration': 219,
707 'upload_date': '20100909',
708 'uploader': 'Amazing Atheist',
709 'uploader_id': 'TheAmazingAtheist',
710 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
711 'title': 'Burning Everyone\'s Koran',
712 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
713 }
714 },
715 # Normal age-gate video (No vevo, embed allowed)
716 {
717 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
718 'info_dict': {
719 'id': 'HtVdAasjOgU',
720 'ext': 'mp4',
721 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
722 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
723 'duration': 142,
724 'uploader': 'The Witcher',
725 'uploader_id': 'WitcherGame',
726 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
727 'upload_date': '20140605',
728 'age_limit': 18,
729 },
730 },
731 # Age-gate video with encrypted signature
732 {
733 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
734 'info_dict': {
735 'id': '6kLq3WMV1nU',
736 'ext': 'mp4',
737 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
738 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
739 'duration': 246,
740 'uploader': 'LloydVEVO',
741 'uploader_id': 'LloydVEVO',
742 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
743 'upload_date': '20110629',
744 'age_limit': 18,
745 },
746 },
747 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
748 # YouTube Red ad is not captured for creator
749 {
750 'url': '__2ABJjxzNo',
751 'info_dict': {
752 'id': '__2ABJjxzNo',
753 'ext': 'mp4',
754 'duration': 266,
755 'upload_date': '20100430',
756 'uploader_id': 'deadmau5',
757 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
758 'creator': 'deadmau5',
759 'description': 'md5:12c56784b8032162bb936a5f76d55360',
760 'uploader': 'deadmau5',
761 'title': 'Deadmau5 - Some Chords (HD)',
762 'alt_title': 'Some Chords',
763 },
764 'expected_warnings': [
765 'DASH manifest missing',
766 ]
767 },
768 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
769 {
770 'url': 'lqQg6PlCWgI',
771 'info_dict': {
772 'id': 'lqQg6PlCWgI',
773 'ext': 'mp4',
774 'duration': 6085,
775 'upload_date': '20150827',
776 'uploader_id': 'olympic',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
778 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
779 'uploader': 'Olympic',
780 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
781 },
782 'params': {
783 'skip_download': 'requires avconv',
784 }
785 },
786 # Non-square pixels
787 {
788 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
789 'info_dict': {
790 'id': '_b-2C3KPAM0',
791 'ext': 'mp4',
792 'stretched_ratio': 16 / 9.,
793 'duration': 85,
794 'upload_date': '20110310',
795 'uploader_id': 'AllenMeow',
796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
797 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
798 'uploader': '孫ᄋᄅ',
799 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
800 },
801 },
802 # url_encoded_fmt_stream_map is empty string
803 {
804 'url': 'qEJwOuvDf7I',
805 'info_dict': {
806 'id': 'qEJwOuvDf7I',
807 'ext': 'webm',
808 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
809 'description': '',
810 'upload_date': '20150404',
811 'uploader_id': 'spbelect',
812 'uploader': 'Наблюдатели Петербурга',
813 },
814 'params': {
815 'skip_download': 'requires avconv',
816 },
817 'skip': 'This live event has ended.',
818 },
819 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
820 {
821 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
822 'info_dict': {
823 'id': 'FIl7x6_3R5Y',
824 'ext': 'webm',
825 'title': 'md5:7b81415841e02ecd4313668cde88737a',
826 'description': 'md5:116377fd2963b81ec4ce64b542173306',
827 'duration': 220,
828 'upload_date': '20150625',
829 'uploader_id': 'dorappi2000',
830 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
831 'uploader': 'dorappi2000',
832 'formats': 'mincount:31',
833 },
834 'skip': 'not actual anymore',
835 },
836 # DASH manifest with segment_list
837 {
838 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
839 'md5': '8ce563a1d667b599d21064e982ab9e31',
840 'info_dict': {
841 'id': 'CsmdDsKjzN8',
842 'ext': 'mp4',
843 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
844 'uploader': 'Airtek',
845 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
846 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
847 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
848 },
849 'params': {
850 'youtube_include_dash_manifest': True,
851 'format': '135', # bestvideo
852 },
853 'skip': 'This live event has ended.',
854 },
855 {
856 # Multifeed videos (multiple cameras), URL is for Main Camera
857 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
858 'info_dict': {
859 'id': 'jqWvoWXjCVs',
860 'title': 'teamPGP: Rocket League Noob Stream',
861 'description': 'md5:dc7872fb300e143831327f1bae3af010',
862 },
863 'playlist': [{
864 'info_dict': {
865 'id': 'jqWvoWXjCVs',
866 'ext': 'mp4',
867 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
868 'description': 'md5:dc7872fb300e143831327f1bae3af010',
869 'duration': 7335,
870 'upload_date': '20150721',
871 'uploader': 'Beer Games Beer',
872 'uploader_id': 'beergamesbeer',
873 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
874 'license': 'Standard YouTube License',
875 },
876 }, {
877 'info_dict': {
878 'id': '6h8e8xoXJzg',
879 'ext': 'mp4',
880 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
881 'description': 'md5:dc7872fb300e143831327f1bae3af010',
882 'duration': 7337,
883 'upload_date': '20150721',
884 'uploader': 'Beer Games Beer',
885 'uploader_id': 'beergamesbeer',
886 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
887 'license': 'Standard YouTube License',
888 },
889 }, {
890 'info_dict': {
891 'id': 'PUOgX5z9xZw',
892 'ext': 'mp4',
893 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
894 'description': 'md5:dc7872fb300e143831327f1bae3af010',
895 'duration': 7337,
896 'upload_date': '20150721',
897 'uploader': 'Beer Games Beer',
898 'uploader_id': 'beergamesbeer',
899 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
900 'license': 'Standard YouTube License',
901 },
902 }, {
903 'info_dict': {
904 'id': 'teuwxikvS5k',
905 'ext': 'mp4',
906 'title': 'teamPGP: Rocket League Noob Stream (zim)',
907 'description': 'md5:dc7872fb300e143831327f1bae3af010',
908 'duration': 7334,
909 'upload_date': '20150721',
910 'uploader': 'Beer Games Beer',
911 'uploader_id': 'beergamesbeer',
912 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
913 'license': 'Standard YouTube License',
914 },
915 }],
916 'params': {
917 'skip_download': True,
918 },
919 'skip': 'This video is not available.',
920 },
921 {
922 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
923 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
924 'info_dict': {
925 'id': 'gVfLd0zydlo',
926 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
927 },
928 'playlist_count': 2,
929 'skip': 'Not multifeed anymore',
930 },
931 {
932 'url': 'https://vid.plus/FlRa-iH7PGw',
933 'only_matching': True,
934 },
935 {
936 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
937 'only_matching': True,
938 },
939 {
940 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
941 # Also tests cut-off URL expansion in video description (see
942 # https://github.com/ytdl-org/youtube-dl/issues/1892,
943 # https://github.com/ytdl-org/youtube-dl/issues/8164)
944 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
945 'info_dict': {
946 'id': 'lsguqyKfVQg',
947 'ext': 'mp4',
948 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
949 'alt_title': 'Dark Walk - Position Music',
950 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
951 'duration': 133,
952 'upload_date': '20151119',
953 'uploader_id': 'IronSoulElf',
954 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
955 'uploader': 'IronSoulElf',
956 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
957 'track': 'Dark Walk - Position Music',
958 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
959 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
960 },
961 'params': {
962 'skip_download': True,
963 },
964 },
965 {
966 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
967 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
968 'only_matching': True,
969 },
970 {
971 # Video with yt:stretch=17:0
972 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
973 'info_dict': {
974 'id': 'Q39EVAstoRM',
975 'ext': 'mp4',
976 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
977 'description': 'md5:ee18a25c350637c8faff806845bddee9',
978 'upload_date': '20151107',
979 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
980 'uploader': 'CH GAMER DROID',
981 },
982 'params': {
983 'skip_download': True,
984 },
985 'skip': 'This video does not exist.',
986 },
987 {
988 # Video licensed under Creative Commons
989 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
990 'info_dict': {
991 'id': 'M4gD1WSo5mA',
992 'ext': 'mp4',
993 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
994 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
995 'duration': 721,
996 'upload_date': '20150127',
997 'uploader_id': 'BerkmanCenter',
998 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
999 'uploader': 'The Berkman Klein Center for Internet & Society',
1000 'license': 'Creative Commons Attribution license (reuse allowed)',
1001 },
1002 'params': {
1003 'skip_download': True,
1004 },
1005 },
1006 {
1007 # Channel-like uploader_url
1008 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1009 'info_dict': {
1010 'id': 'eQcmzGIKrzg',
1011 'ext': 'mp4',
1012 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1013 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
1014 'duration': 4060,
1015 'upload_date': '20151119',
1016 'uploader': 'Bernie Sanders',
1017 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1018 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1019 'license': 'Creative Commons Attribution license (reuse allowed)',
1020 },
1021 'params': {
1022 'skip_download': True,
1023 },
1024 },
1025 {
1026 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1027 'only_matching': True,
1028 },
1029 {
1030 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1031 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1032 'only_matching': True,
1033 },
1034 {
1035 # Rental video preview
1036 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1037 'info_dict': {
1038 'id': 'uGpuVWrhIzE',
1039 'ext': 'mp4',
1040 'title': 'Piku - Trailer',
1041 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1042 'upload_date': '20150811',
1043 'uploader': 'FlixMatrix',
1044 'uploader_id': 'FlixMatrixKaravan',
1045 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1046 'license': 'Standard YouTube License',
1047 },
1048 'params': {
1049 'skip_download': True,
1050 },
1051 'skip': 'This video is not available.',
1052 },
1053 {
1054 # YouTube Red video with episode data
1055 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1056 'info_dict': {
1057 'id': 'iqKdEhx-dD4',
1058 'ext': 'mp4',
1059 'title': 'Isolation - Mind Field (Ep 1)',
1060 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1061 'duration': 2085,
1062 'upload_date': '20170118',
1063 'uploader': 'Vsauce',
1064 'uploader_id': 'Vsauce',
1065 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1066 'series': 'Mind Field',
1067 'season_number': 1,
1068 'episode_number': 1,
1069 },
1070 'params': {
1071 'skip_download': True,
1072 },
1073 'expected_warnings': [
1074 'Skipping DASH manifest',
1075 ],
1076 },
1077 {
1078 # The following content has been identified by the YouTube community
1079 # as inappropriate or offensive to some audiences.
1080 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1081 'info_dict': {
1082 'id': '6SJNVb0GnPI',
1083 'ext': 'mp4',
1084 'title': 'Race Differences in Intelligence',
1085 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1086 'duration': 965,
1087 'upload_date': '20140124',
1088 'uploader': 'New Century Foundation',
1089 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1090 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1091 },
1092 'params': {
1093 'skip_download': True,
1094 },
1095 },
1096 {
1097 # itag 212
1098 'url': '1t24XAntNCY',
1099 'only_matching': True,
1100 },
1101 {
1102 # geo restricted to JP
1103 'url': 'sJL6WA-aGkQ',
1104 'only_matching': True,
1105 },
1106 {
1107 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1108 'only_matching': True,
1109 },
1110 {
1111 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1112 'only_matching': True,
1113 },
1114 {
1115 # DRM protected
1116 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1117 'only_matching': True,
1118 },
1119 {
1120 # Video with unsupported adaptive stream type formats
1121 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1122 'info_dict': {
1123 'id': 'Z4Vy8R84T1U',
1124 'ext': 'mp4',
1125 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1126 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1127 'duration': 433,
1128 'upload_date': '20130923',
1129 'uploader': 'Amelia Putri Harwita',
1130 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1131 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1132 'formats': 'maxcount:10',
1133 },
1134 'params': {
1135 'skip_download': True,
1136 'youtube_include_dash_manifest': False,
1137 },
1138 },
1139 {
1140 # Youtube Music Auto-generated description
1141 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1142 'info_dict': {
1143 'id': 'MgNrAu2pzNs',
1144 'ext': 'mp4',
1145 'title': 'Voyeur Girl',
1146 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1147 'upload_date': '20190312',
1148 'uploader': 'Various Artists - Topic',
1149 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
1150 'artist': 'Stephen',
1151 'track': 'Voyeur Girl',
1152 'album': 'it\'s too much love to know my dear',
1153 'release_date': '20190313',
1154 'release_year': 2019,
1155 },
1156 'params': {
1157 'skip_download': True,
1158 },
1159 },
1160 {
1161 # Youtube Music Auto-generated description
1162 # Retrieve 'artist' field from 'Artist:' in video description
1163 # when it is present on youtube music video
1164 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1165 'info_dict': {
1166 'id': 'k0jLE7tTwjY',
1167 'ext': 'mp4',
1168 'title': 'Latch Feat. Sam Smith',
1169 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1170 'upload_date': '20150110',
1171 'uploader': 'Various Artists - Topic',
1172 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1173 'artist': 'Disclosure',
1174 'track': 'Latch Feat. Sam Smith',
1175 'album': 'Latch Featuring Sam Smith',
1176 'release_date': '20121008',
1177 'release_year': 2012,
1178 },
1179 'params': {
1180 'skip_download': True,
1181 },
1182 },
1183 {
1184 # Youtube Music Auto-generated description
1185 # handle multiple artists on youtube music video
1186 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1187 'info_dict': {
1188 'id': '74qn0eJSjpA',
1189 'ext': 'mp4',
1190 'title': 'Eastside',
1191 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1192 'upload_date': '20180710',
1193 'uploader': 'Benny Blanco - Topic',
1194 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1195 'artist': 'benny blanco, Halsey, Khalid',
1196 'track': 'Eastside',
1197 'album': 'Eastside',
1198 'release_date': '20180713',
1199 'release_year': 2018,
1200 },
1201 'params': {
1202 'skip_download': True,
1203 },
1204 },
1205 {
1206 # Youtube Music Auto-generated description
1207 # handle youtube music video with release_year and no release_date
1208 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1209 'info_dict': {
1210 'id': '-hcAI0g-f5M',
1211 'ext': 'mp4',
1212 'title': 'Put It On Me',
1213 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
1214 'upload_date': '20180426',
1215 'uploader': 'Matt Maeson - Topic',
1216 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1217 'artist': 'Matt Maeson',
1218 'track': 'Put It On Me',
1219 'album': 'The Hearse',
1220 'release_date': None,
1221 'release_year': 2018,
1222 },
1223 'params': {
1224 'skip_download': True,
1225 },
1226 },
1227 {
1228 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1229 'only_matching': True,
1230 },
1231 ]
1232
def __init__(self, *args, **kwargs):
    """Initialize the extractor and its per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature-length spec) -> decipher function,
    # so each player build is only interpreted once per process.
    self._player_cache = dict()
1236
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
1240
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    note = '%s: Extracting video information' % video_id
    self.to_screen(note)
1244
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
1248
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    notice = 'RTMP download detected'
    self.to_screen(notice)
1252
def _signature_cache_id(self, example_sig):
    """Return a dotted string of the lengths of the signature's parts.

    E.g. 'abc.de.f' -> '3.2.1'; used as a cache key shape for signatures."""
    part_lengths = (len(part) for part in example_sig.split('.'))
    return '.'.join(map(compat_str, part_lengths))
1256
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Download the player and build a function that deciphers signatures.

    The learned character permutation is cached on disk, keyed by player
    type/id and the part-length shape of example_sig, so each player
    build only has to be interpreted once.
    """
    # The player URL encodes the player's build id and type (js or swf).
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id doubles as a cache file name, so it must not contain path
    # separators.
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec is a list of input-character indices: the deciphered
        # signature is the input rearranged by those indices.
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    # Probe the extracted function with a string of distinct characters
    # to learn the permutation it applies, then cache that permutation.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1302
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the extracted signature function.

    Probes func with a string of distinct characters, then compresses
    the observed index permutation into slice expressions for readable
    output (used with the youtube_print_sig_code option).
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render a Python slice expression, omitting defaults
            # (start 0, step 1) for brevity.
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run with constant stride: extend it, or flush
                # the finished slice and start over.
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # Two adjacent indices open a potential slice run.
                step = i - prev
                start = prev
                continue
            else:
                # Isolated index: emit a single subscript.
                yield 's[%d]' % prev
        # Flush the last element or the still-open slice run.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
1341
def _parse_sig_js(self, jscode):
    """Find the signature-deciphering function in the player JS and wrap
    it into a Python callable via JSInterpreter."""
    # Patterns are tried in order; newer player layouts come first,
    # obsolete ones are kept as fallbacks.
    funcname = self._search_regex(
        (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
         # Obsolete patterns
         r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    # The interpreted function takes its arguments as a list.
    return lambda s: initial_function([s])
1361
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from an SWF player binary."""
    interpreter = SWFInterpreter(file_contents)
    # The decipher routine lives on the SignatureDecipher class inside
    # the SWF.
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')
    return lambda s: decipher([s])
1368
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Decipher the encrypted 's' signature field using the video's player."""

    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Normalise scheme-relative and site-relative player URLs.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)
    try:
        # Cache the extracted function per (player, signature shape).
        cache_key = (player_url, self._signature_cache_id(s))
        func = self._player_cache.get(cache_key)
        if func is None:
            func = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = func
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(func, s)
        return func(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
1395
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to lists of
    subtitle format dicts ({'url', 'ext'}).

    Returns an empty dict (after emitting a warning) when the subtitle
    list cannot be downloaded or the video has no subtitles.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in sub_lang_list:
            continue
        sub_formats = []
        for ext in self._SUBTITLE_FORMATS:
            params = compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                # Some tracks carry no 'name' attribute; fall back to an
                # empty name instead of crashing with KeyError.
                'name': track.attrib.get('name', '').encode('utf-8'),
            })
            sub_formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + params,
                'ext': ext,
            })
        sub_lang_list[lang] = sub_formats
    if not sub_lang_list:
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
    return sub_lang_list
1427
def _get_ytplayer_config(self, video_id, webpage):
    """Locate and parse the ytplayer.config JSON embedded in a watch page.

    Returns the parsed dict, or None when no config can be found or it
    fails to parse."""
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    config = self._search_regex(
        patterns, webpage, 'ytplayer.config', default=None)
    if not config:
        return None
    return self._parse_json(
        uppercase_escape(config), video_id, fatal=False)
1445
def _get_automatic_captions(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Returns a dict mapping language codes to lists of caption format
    dicts ({'url', 'ext'}); empty dict (with a warning) on failure.
    Three historical caption layouts are tried in turn: 'ttsurl',
    'player_response' captions, and 'caption_tracks'."""
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            # Legacy flow: player config exposes a timedtext base URL
            # plus a timestamp that must accompany every request.
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # Build one URL per (target language, subtitle format).
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Build the captions dict by rewriting sub_url's query string
            # with each target language and subtitle format in turn.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1547
def _mark_watched(self, video_id, video_info, player_response):
    """Ping YouTube's playback-tracking URL so the video is marked watched.

    Best effort: silently returns when no tracking URL is available, and
    the final request is non-fatal.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.choice picks uniformly; the previous randint(0, 256) & 63 was
    # subtly biased towards index 0 (randint's upper bound is inclusive,
    # so 0, 64, 128, 192 and 256 all mapped to alphabet position 0).
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1573
@staticmethod
def _extract_urls(webpage):
    """Return all embedded YouTube URLs / video ids found in *webpage*."""
    entries = []

    # Embedded YouTube player
    for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage):
        entries.append(unescapeHTML(mobj.group('url')))

    # lazyYT YouTube embed
    for lazy_id in re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage):
        entries.append(unescapeHTML(lazy_id))

    # Wordpress "YouTube Video Importer" plugin
    wp_matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Last capture group of each match is the video id itself.
    entries.extend(groups[-1] for groups in wp_matches)

    return entries
1605
@staticmethod
def _extract_url(webpage):
    """Return the first embedded YouTube URL in the page, or None."""
    for embedded_url in YoutubeIE._extract_urls(webpage):
        return embedded_url
    return None
1610
@classmethod
def extract_id(cls, url):
    """Extract the video id from a YouTube URL.

    Raises ExtractorError when the URL does not match _VALID_URL."""
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    # Group 2 of _VALID_URL captures the 11-character video id.
    return mobj.group(2)
1618
@staticmethod
def _extract_chapters(description, duration):
    """Parse chapter markers (seekTo links) out of a video description.

    Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
    when the description contains no recognizable chapter lines."""
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    total = len(chapter_lines)
    chapters = []
    for idx, (line, time_point) in enumerate(chapter_lines):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if start_time > duration:
            break
        # The chapter ends where the next one starts; the last chapter
        # ends at the video's duration.
        is_last = (idx + 1 == total)
        end_time = (duration if is_last
                    else parse_duration(chapter_lines[idx + 1][1]))
        if end_time is None:
            continue
        end_time = min(end_time, duration)
        if start_time > end_time:
            break
        # Strip the seekTo link and surrounding dashes/whitespace from
        # the chapter title.
        title = re.sub(r'<a[^>]+>[^<]+</a>', '', line).strip(' \t-')
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': re.sub(r'\s+', ' ', title),
        })
    return chapters
1653
1654 def _real_extract(self, url):
1655 url, smuggled_data = unsmuggle_url(url, {})
1656
1657 proto = (
1658 'http' if self._downloader.params.get('prefer_insecure', False)
1659 else 'https')
1660
1661 start_time = None
1662 end_time = None
1663 parsed_url = compat_urllib_parse_urlparse(url)
1664 for component in [parsed_url.fragment, parsed_url.query]:
1665 query = compat_parse_qs(component)
1666 if start_time is None and 't' in query:
1667 start_time = parse_duration(query['t'][0])
1668 if start_time is None and 'start' in query:
1669 start_time = parse_duration(query['start'][0])
1670 if end_time is None and 'end' in query:
1671 end_time = parse_duration(query['end'][0])
1672
1673 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1674 mobj = re.search(self._NEXT_URL_RE, url)
1675 if mobj:
1676 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1677 video_id = self.extract_id(url)
1678
1679 # Get video webpage
1680 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1681 video_webpage = self._download_webpage(url, video_id)
1682
1683 # Attempt to extract SWF player URL
1684 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1685 if mobj is not None:
1686 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1687 else:
1688 player_url = None
1689
1690 dash_mpds = []
1691
1692 def add_dash_mpd(video_info):
1693 dash_mpd = video_info.get('dashmpd')
1694 if dash_mpd and dash_mpd[0] not in dash_mpds:
1695 dash_mpds.append(dash_mpd[0])
1696
1697 def add_dash_mpd_pr(pl_response):
1698 dash_mpd = url_or_none(try_get(
1699 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1700 compat_str))
1701 if dash_mpd and dash_mpd not in dash_mpds:
1702 dash_mpds.append(dash_mpd)
1703
1704 is_live = None
1705 view_count = None
1706
1707 def extract_view_count(v_info):
1708 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1709
1710 def extract_token(v_info):
1711 return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
1712
1713 def extract_player_response(player_response, video_id):
1714 pl_response = str_or_none(player_response)
1715 if not pl_response:
1716 return
1717 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1718 if isinstance(pl_response, dict):
1719 add_dash_mpd_pr(pl_response)
1720 return pl_response
1721
1722 player_response = {}
1723
1724 # Get video info
1725 embed_webpage = None
1726 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1727 age_gate = True
1728 # We simulate the access to the video from www.youtube.com/v/{video_id}
1729 # this can be viewed without login into Youtube
1730 url = proto + '://www.youtube.com/embed/%s' % video_id
1731 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1732 data = compat_urllib_parse_urlencode({
1733 'video_id': video_id,
1734 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1735 'sts': self._search_regex(
1736 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1737 })
1738 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1739 video_info_webpage = self._download_webpage(
1740 video_info_url, video_id,
1741 note='Refetching age-gated info webpage',
1742 errnote='unable to download video info webpage')
1743 video_info = compat_parse_qs(video_info_webpage)
1744 pl_response = video_info.get('player_response', [None])[0]
1745 player_response = extract_player_response(pl_response, video_id)
1746 add_dash_mpd(video_info)
1747 view_count = extract_view_count(video_info)
1748 else:
1749 age_gate = False
1750 video_info = None
1751 sts = None
1752 # Try looking directly into the video webpage
1753 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1754 if ytplayer_config:
1755 args = ytplayer_config['args']
1756 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1757 # Convert to the same format returned by compat_parse_qs
1758 video_info = dict((k, [v]) for k, v in args.items())
1759 add_dash_mpd(video_info)
1760 # Rental video is not rented but preview is available (e.g.
1761 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1762 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1763 if not video_info and args.get('ypc_vid'):
1764 return self.url_result(
1765 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1766 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1767 is_live = True
1768 sts = ytplayer_config.get('sts')
1769 if not player_response:
1770 player_response = extract_player_response(args.get('player_response'), video_id)
1771 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1772 add_dash_mpd_pr(player_response)
1773 # We also try looking in get_video_info since it may contain different dashmpd
1774 # URL that points to a DASH manifest with possibly different itag set (some itags
1775 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1776 # manifest pointed by get_video_info's dashmpd).
1777 # The general idea is to take a union of itags of both DASH manifests (for example
1778 # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)
1779 self.report_video_info_webpage_download(video_id)
1780 for el in ('embedded', 'detailpage', 'vevo', ''):
1781 query = {
1782 'video_id': video_id,
1783 'ps': 'default',
1784 'eurl': '',
1785 'gl': 'US',
1786 'hl': 'en',
1787 }
1788 if el:
1789 query['el'] = el
1790 if sts:
1791 query['sts'] = sts
1792 video_info_webpage = self._download_webpage(
1793 '%s://www.youtube.com/get_video_info' % proto,
1794 video_id, note=False,
1795 errnote='unable to download video info webpage',
1796 fatal=False, query=query)
1797 if not video_info_webpage:
1798 continue
1799 get_video_info = compat_parse_qs(video_info_webpage)
1800 if not player_response:
1801 pl_response = get_video_info.get('player_response', [None])[0]
1802 player_response = extract_player_response(pl_response, video_id)
1803 add_dash_mpd(get_video_info)
1804 if view_count is None:
1805 view_count = extract_view_count(get_video_info)
1806 if not video_info:
1807 video_info = get_video_info
1808 get_token = extract_token(get_video_info)
1809 if get_token:
1810 # Different get_video_info requests may report different results, e.g.
1811 # some may report video unavailability, but some may serve it without
1812 # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,
1813 # the original webpage as well as el=info and el=embedded get_video_info
1814 # requests report video unavailability due to geo restriction while
1815 # el=detailpage succeeds and returns valid data). This is probably
1816 # due to YouTube measures against IP ranges of hosting providers.
1817 # Working around by preferring the first succeeded video_info containing
1818 # the token if no such video_info yet was found.
1819 token = extract_token(video_info)
1820 if not token:
1821 video_info = get_video_info
1822 break
1823
1824 def extract_unavailable_message():
1825 messages = []
1826 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1827 msg = self._html_search_regex(
1828 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1829 video_webpage, 'unavailable %s' % kind, default=None)
1830 if msg:
1831 messages.append(msg)
1832 if messages:
1833 return '\n'.join(messages)
1834
1835 if not video_info:
1836 unavailable_message = extract_unavailable_message()
1837 if not unavailable_message:
1838 unavailable_message = 'Unable to extract video data'
1839 raise ExtractorError(
1840 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1841
1842 video_details = try_get(
1843 player_response, lambda x: x['videoDetails'], dict) or {}
1844
1845 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1846 if not video_title:
1847 self._downloader.report_warning('Unable to extract video title')
1848 video_title = '_'
1849
1850 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1851 if video_description:
1852
1853 def replace_url(m):
1854 redir_url = compat_urlparse.urljoin(url, m.group(1))
1855 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1856 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1857 qs = compat_parse_qs(parsed_redir_url.query)
1858 q = qs.get('q')
1859 if q and q[0]:
1860 return q[0]
1861 return redir_url
1862
1863 description_original = video_description = re.sub(r'''(?x)
1864 <a\s+
1865 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1866 (?:title|href)="([^"]+)"\s+
1867 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1868 class="[^"]*"[^>]*>
1869 [^<]+\.{3}\s*
1870 </a>
1871 ''', replace_url, video_description)
1872 video_description = clean_html(video_description)
1873 else:
1874 video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')
1875
1876 if not smuggled_data.get('force_singlefeed', False):
1877 if not self._downloader.params.get('noplaylist'):
1878 multifeed_metadata_list = try_get(
1879 player_response,
1880 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1881 compat_str) or try_get(
1882 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1883 if multifeed_metadata_list:
1884 entries = []
1885 feed_ids = []
1886 for feed in multifeed_metadata_list.split(','):
1887 # Unquote should take place before split on comma (,) since textual
1888 # fields may contain comma as well (see
1889 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1890 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1891 entries.append({
1892 '_type': 'url_transparent',
1893 'ie_key': 'Youtube',
1894 'url': smuggle_url(
1895 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1896 {'force_singlefeed': True}),
1897 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1898 })
1899 feed_ids.append(feed_data['id'][0])
1900 self.to_screen(
1901 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1902 % (', '.join(feed_ids), video_id))
1903 return self.playlist_result(entries, video_id, video_title, video_description)
1904 else:
1905 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1906
1907 if view_count is None:
1908 view_count = extract_view_count(video_info)
1909 if view_count is None and video_details:
1910 view_count = int_or_none(video_details.get('viewCount'))
1911
1912 if is_live is None:
1913 is_live = bool_or_none(video_details.get('isLive'))
1914
1915 # Check for "rental" videos
1916 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1917 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1918
1919 def _extract_filesize(media_url):
1920 return int_or_none(self._search_regex(
1921 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1922
1923 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1924 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1925
1926 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1927 self.report_rtmp_download()
1928 formats = [{
1929 'format_id': '_rtmp',
1930 'protocol': 'rtmp',
1931 'url': video_info['conn'][0],
1932 'player_url': player_url,
1933 }]
1934 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1935 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1936 if 'rtmpe%3Dyes' in encoded_url_map:
1937 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1938 formats = []
1939 formats_spec = {}
1940 fmt_list = video_info.get('fmt_list', [''])[0]
1941 if fmt_list:
1942 for fmt in fmt_list.split(','):
1943 spec = fmt.split('/')
1944 if len(spec) > 1:
1945 width_height = spec[1].split('x')
1946 if len(width_height) == 2:
1947 formats_spec[spec[0]] = {
1948 'resolution': spec[1],
1949 'width': int_or_none(width_height[0]),
1950 'height': int_or_none(width_height[1]),
1951 }
1952 for fmt in streaming_formats:
1953 itag = str_or_none(fmt.get('itag'))
1954 if not itag:
1955 continue
1956 quality = fmt.get('quality')
1957 quality_label = fmt.get('qualityLabel') or quality
1958 formats_spec[itag] = {
1959 'asr': int_or_none(fmt.get('audioSampleRate')),
1960 'filesize': int_or_none(fmt.get('contentLength')),
1961 'format_note': quality_label,
1962 'fps': int_or_none(fmt.get('fps')),
1963 'height': int_or_none(fmt.get('height')),
1964 # bitrate for itag 43 is always 2147483647
1965 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1966 'width': int_or_none(fmt.get('width')),
1967 }
1968
1969 for fmt in streaming_formats:
1970 if fmt.get('drm_families'):
1971 continue
1972 url = url_or_none(fmt.get('url'))
1973
1974 if not url:
1975 cipher = fmt.get('cipher')
1976 if not cipher:
1977 continue
1978 url_data = compat_parse_qs(cipher)
1979 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1980 if not url:
1981 continue
1982 else:
1983 cipher = None
1984 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1985
1986 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1987 # Unsupported FORMAT_STREAM_TYPE_OTF
1988 if stream_type == 3:
1989 continue
1990
1991 format_id = fmt.get('itag') or url_data['itag'][0]
1992 if not format_id:
1993 continue
1994 format_id = compat_str(format_id)
1995
1996 if cipher:
1997 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1998 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1999 jsplayer_url_json = self._search_regex(
2000 ASSETS_RE,
2001 embed_webpage if age_gate else video_webpage,
2002 'JS player URL (1)', default=None)
2003 if not jsplayer_url_json and not age_gate:
2004 # We need the embed website after all
2005 if embed_webpage is None:
2006 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2007 embed_webpage = self._download_webpage(
2008 embed_url, video_id, 'Downloading embed webpage')
2009 jsplayer_url_json = self._search_regex(
2010 ASSETS_RE, embed_webpage, 'JS player URL')
2011
2012 player_url = json.loads(jsplayer_url_json)
2013 if player_url is None:
2014 player_url_json = self._search_regex(
2015 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2016 video_webpage, 'age gate player URL')
2017 player_url = json.loads(player_url_json)
2018
2019 if 'sig' in url_data:
2020 url += '&signature=' + url_data['sig'][0]
2021 elif 's' in url_data:
2022 encrypted_sig = url_data['s'][0]
2023
2024 if self._downloader.params.get('verbose'):
2025 if player_url is None:
2026 player_version = 'unknown'
2027 player_desc = 'unknown'
2028 else:
2029 if player_url.endswith('swf'):
2030 player_version = self._search_regex(
2031 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
2032 'flash player', fatal=False)
2033 player_desc = 'flash player %s' % player_version
2034 else:
2035 player_version = self._search_regex(
2036 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
2037 r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
2038 player_url,
2039 'html5 player', fatal=False)
2040 player_desc = 'html5 player %s' % player_version
2041
2042 parts_sizes = self._signature_cache_id(encrypted_sig)
2043 self.to_screen('{%s} signature length %s, %s' %
2044 (format_id, parts_sizes, player_desc))
2045
2046 signature = self._decrypt_signature(
2047 encrypted_sig, video_id, player_url, age_gate)
2048 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2049 url += '&%s=%s' % (sp, signature)
2050 if 'ratebypass' not in url:
2051 url += '&ratebypass=yes'
2052
2053 dct = {
2054 'format_id': format_id,
2055 'url': url,
2056 'player_url': player_url,
2057 }
2058 if format_id in self._formats:
2059 dct.update(self._formats[format_id])
2060 if format_id in formats_spec:
2061 dct.update(formats_spec[format_id])
2062
2063 # Some itags are not included in DASH manifest thus corresponding formats will
2064 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2065 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2066 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2067 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2068
2069 if width is None:
2070 width = int_or_none(fmt.get('width'))
2071 if height is None:
2072 height = int_or_none(fmt.get('height'))
2073
2074 filesize = int_or_none(url_data.get(
2075 'clen', [None])[0]) or _extract_filesize(url)
2076
2077 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2078 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2079
2080 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2081 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2082 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2083
2084 more_fields = {
2085 'filesize': filesize,
2086 'tbr': tbr,
2087 'width': width,
2088 'height': height,
2089 'fps': fps,
2090 'format_note': quality_label or quality,
2091 }
2092 for key, value in more_fields.items():
2093 if value:
2094 dct[key] = value
2095 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2096 if type_:
2097 type_split = type_.split(';')
2098 kind_ext = type_split[0].split('/')
2099 if len(kind_ext) == 2:
2100 kind, _ = kind_ext
2101 dct['ext'] = mimetype2ext(type_split[0])
2102 if kind in ('audio', 'video'):
2103 codecs = None
2104 for mobj in re.finditer(
2105 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2106 if mobj.group('key') == 'codecs':
2107 codecs = mobj.group('val')
2108 break
2109 if codecs:
2110 dct.update(parse_codecs(codecs))
2111 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2112 dct['downloader_options'] = {
2113 # Youtube throttles chunks >~10M
2114 'http_chunk_size': 10485760,
2115 }
2116 formats.append(dct)
2117 else:
2118 manifest_url = (
2119 url_or_none(try_get(
2120 player_response,
2121 lambda x: x['streamingData']['hlsManifestUrl'],
2122 compat_str))
2123 or url_or_none(try_get(
2124 video_info, lambda x: x['hlsvp'][0], compat_str)))
2125 if manifest_url:
2126 formats = []
2127 m3u8_formats = self._extract_m3u8_formats(
2128 manifest_url, video_id, 'mp4', fatal=False)
2129 for a_format in m3u8_formats:
2130 itag = self._search_regex(
2131 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2132 if itag:
2133 a_format['format_id'] = itag
2134 if itag in self._formats:
2135 dct = self._formats[itag].copy()
2136 dct.update(a_format)
2137 a_format = dct
2138 a_format['player_url'] = player_url
2139 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2140 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2141 formats.append(a_format)
2142 else:
2143 error_message = extract_unavailable_message()
2144 if not error_message:
2145 error_message = clean_html(try_get(
2146 player_response, lambda x: x['playabilityStatus']['reason'],
2147 compat_str))
2148 if not error_message:
2149 error_message = clean_html(
2150 try_get(video_info, lambda x: x['reason'][0], compat_str))
2151 if error_message:
2152 raise ExtractorError(error_message, expected=True)
2153 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2154
2155 # uploader
2156 video_uploader = try_get(
2157 video_info, lambda x: x['author'][0],
2158 compat_str) or str_or_none(video_details.get('author'))
2159 if video_uploader:
2160 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2161 else:
2162 self._downloader.report_warning('unable to extract uploader name')
2163
2164 # uploader_id
2165 video_uploader_id = None
2166 video_uploader_url = None
2167 mobj = re.search(
2168 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2169 video_webpage)
2170 if mobj is not None:
2171 video_uploader_id = mobj.group('uploader_id')
2172 video_uploader_url = mobj.group('uploader_url')
2173 else:
2174 self._downloader.report_warning('unable to extract uploader nickname')
2175
2176 channel_id = (
2177 str_or_none(video_details.get('channelId'))
2178 or self._html_search_meta(
2179 'channelId', video_webpage, 'channel id', default=None)
2180 or self._search_regex(
2181 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2182 video_webpage, 'channel id', default=None, group='id'))
2183 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2184
2185 # thumbnail image
2186 # We try first to get a high quality image:
2187 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2188 video_webpage, re.DOTALL)
2189 if m_thumb is not None:
2190 video_thumbnail = m_thumb.group(1)
2191 elif 'thumbnail_url' not in video_info:
2192 self._downloader.report_warning('unable to extract video thumbnail')
2193 video_thumbnail = None
2194 else: # don't panic if we can't find it
2195 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
2196
2197 # upload date
2198 upload_date = self._html_search_meta(
2199 'datePublished', video_webpage, 'upload date', default=None)
2200 if not upload_date:
2201 upload_date = self._search_regex(
2202 [r'(?s)id="eow-date.*?>(.*?)</span>',
2203 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2204 video_webpage, 'upload date', default=None)
2205 upload_date = unified_strdate(upload_date)
2206
2207 video_license = self._html_search_regex(
2208 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2209 video_webpage, 'license', default=None)
2210
2211 m_music = re.search(
2212 r'''(?x)
2213 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2214 <ul[^>]*>\s*
2215 <li>(?P<title>.+?)
2216 by (?P<creator>.+?)
2217 (?:
2218 \(.+?\)|
2219 <a[^>]*
2220 (?:
2221 \bhref=["\']/red[^>]*>| # drop possible
2222 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2223 )
2224 .*?
2225 )?</li
2226 ''',
2227 video_webpage)
2228 if m_music:
2229 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2230 video_creator = clean_html(m_music.group('creator'))
2231 else:
2232 video_alt_title = video_creator = None
2233
2234 def extract_meta(field):
2235 return self._html_search_regex(
2236 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2237 video_webpage, field, default=None)
2238
2239 track = extract_meta('Song')
2240 artist = extract_meta('Artist')
2241 album = extract_meta('Album')
2242
2243 # Youtube Music Auto-generated description
2244 release_date = release_year = None
2245 if video_description:
2246 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2247 if mobj:
2248 if not track:
2249 track = mobj.group('track').strip()
2250 if not artist:
2251 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2252 if not album:
2253 album = mobj.group('album'.strip())
2254 release_year = mobj.group('release_year')
2255 release_date = mobj.group('release_date')
2256 if release_date:
2257 release_date = release_date.replace('-', '')
2258 if not release_year:
2259 release_year = int(release_date[:4])
2260 if release_year:
2261 release_year = int(release_year)
2262
2263 m_episode = re.search(
2264 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2265 video_webpage)
2266 if m_episode:
2267 series = unescapeHTML(m_episode.group('series'))
2268 season_number = int(m_episode.group('season'))
2269 episode_number = int(m_episode.group('episode'))
2270 else:
2271 series = season_number = episode_number = None
2272
2273 m_cat_container = self._search_regex(
2274 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2275 video_webpage, 'categories', default=None)
2276 if m_cat_container:
2277 category = self._html_search_regex(
2278 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2279 default=None)
2280 video_categories = None if category is None else [category]
2281 else:
2282 video_categories = None
2283
2284 video_tags = [
2285 unescapeHTML(m.group('content'))
2286 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2287
2288 def _extract_count(count_name):
2289 return str_to_int(self._search_regex(
2290 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2291 % re.escape(count_name),
2292 video_webpage, count_name, default=None))
2293
2294 like_count = _extract_count('like')
2295 dislike_count = _extract_count('dislike')
2296
2297 if view_count is None:
2298 view_count = str_to_int(self._search_regex(
2299 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2300 'view count', default=None))
2301
2302 average_rating = (
2303 float_or_none(video_details.get('averageRating'))
2304 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2305
2306 # subtitles
2307 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2308 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2309
2310 video_duration = try_get(
2311 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2312 if not video_duration:
2313 video_duration = int_or_none(video_details.get('lengthSeconds'))
2314 if not video_duration:
2315 video_duration = parse_duration(self._html_search_meta(
2316 'duration', video_webpage, 'video duration'))
2317
2318 # annotations
2319 video_annotations = None
2320 if self._downloader.params.get('writeannotations', False):
2321 xsrf_token = self._search_regex(
2322 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2323 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2324 invideo_url = try_get(
2325 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2326 if xsrf_token and invideo_url:
2327 xsrf_field_name = self._search_regex(
2328 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2329 video_webpage, 'xsrf field name',
2330 group='xsrf_field_name', default='session_token')
2331 video_annotations = self._download_webpage(
2332 self._proto_relative_url(invideo_url),
2333 video_id, note='Downloading annotations',
2334 errnote='Unable to download video annotations', fatal=False,
2335 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2336
2337 chapters = self._extract_chapters(description_original, video_duration)
2338
2339 # Look for the DASH manifest
2340 if self._downloader.params.get('youtube_include_dash_manifest', True):
2341 dash_mpd_fatal = True
2342 for mpd_url in dash_mpds:
2343 dash_formats = {}
2344 try:
2345 def decrypt_sig(mobj):
2346 s = mobj.group(1)
2347 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2348 return '/signature/%s' % dec_s
2349
2350 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2351
2352 for df in self._extract_mpd_formats(
2353 mpd_url, video_id, fatal=dash_mpd_fatal,
2354 formats_dict=self._formats):
2355 if not df.get('filesize'):
2356 df['filesize'] = _extract_filesize(df['url'])
2357 # Do not overwrite DASH format found in some previous DASH manifest
2358 if df['format_id'] not in dash_formats:
2359 dash_formats[df['format_id']] = df
2360 # Additional DASH manifests may end up in HTTP Error 403 therefore
2361 # allow them to fail without bug report message if we already have
2362 # some DASH manifest succeeded. This is temporary workaround to reduce
2363 # burst of bug reports until we figure out the reason and whether it
2364 # can be fixed at all.
2365 dash_mpd_fatal = False
2366 except (ExtractorError, KeyError) as e:
2367 self.report_warning(
2368 'Skipping DASH manifest: %r' % e, video_id)
2369 if dash_formats:
2370 # Remove the formats we found through non-DASH, they
2371 # contain less info and it can be wrong, because we use
2372 # fixed values (for example the resolution). See
2373 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2374 # example.
2375 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2376 formats.extend(dash_formats.values())
2377
2378 # Check for malformed aspect ratio
2379 stretched_m = re.search(
2380 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2381 video_webpage)
2382 if stretched_m:
2383 w = float(stretched_m.group('w'))
2384 h = float(stretched_m.group('h'))
2385 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2386 # We will only process correct ratios.
2387 if w > 0 and h > 0:
2388 ratio = w / h
2389 for f in formats:
2390 if f.get('vcodec') != 'none':
2391 f['stretched_ratio'] = ratio
2392
2393 if not formats:
2394 token = extract_token(video_info)
2395 if not token:
2396 if 'reason' in video_info:
2397 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2398 regions_allowed = self._html_search_meta(
2399 'regionsAllowed', video_webpage, default=None)
2400 countries = regions_allowed.split(',') if regions_allowed else None
2401 self.raise_geo_restricted(
2402 msg=video_info['reason'][0], countries=countries)
2403 reason = video_info['reason'][0]
2404 if 'Invalid parameters' in reason:
2405 unavailable_message = extract_unavailable_message()
2406 if unavailable_message:
2407 reason = unavailable_message
2408 raise ExtractorError(
2409 'YouTube said: %s' % reason,
2410 expected=True, video_id=video_id)
2411 else:
2412 raise ExtractorError(
2413 '"token" parameter not in video info for unknown reason',
2414 video_id=video_id)
2415
2416 if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):
2417 raise ExtractorError('This video is DRM protected.', expected=True)
2418
2419 self._sort_formats(formats)
2420
2421 self.mark_watched(video_id, video_info, player_response)
2422
2423 return {
2424 'id': video_id,
2425 'uploader': video_uploader,
2426 'uploader_id': video_uploader_id,
2427 'uploader_url': video_uploader_url,
2428 'channel_id': channel_id,
2429 'channel_url': channel_url,
2430 'upload_date': upload_date,
2431 'license': video_license,
2432 'creator': video_creator or artist,
2433 'title': video_title,
2434 'alt_title': video_alt_title or track,
2435 'thumbnail': video_thumbnail,
2436 'description': video_description,
2437 'categories': video_categories,
2438 'tags': video_tags,
2439 'subtitles': video_subtitles,
2440 'automatic_captions': automatic_captions,
2441 'duration': video_duration,
2442 'age_limit': 18 if age_gate else 0,
2443 'annotations': video_annotations,
2444 'chapters': chapters,
2445 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2446 'view_count': view_count,
2447 'like_count': like_count,
2448 'dislike_count': dislike_count,
2449 'average_rating': average_rating,
2450 'formats': formats,
2451 'is_live': is_live,
2452 'start_time': start_time,
2453 'end_time': end_time,
2454 'series': series,
2455 'season_number': season_number,
2456 'episode_number': episode_number,
2457 'track': track,
2458 'artist': artist,
2459 'album': album,
2460 'release_date': release_date,
2461 'release_year': release_year,
2462 }
2463
2464
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Group 1 matches the playlist id embedded in a watch/embed/playlist URL,
    # group 2 matches a bare playlist id (see _PLAYLIST_ID_RE).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
        'skip': 'This playlist is private',
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        self._login()

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs scraped from *page*.

        Tries the modern data-video-id markup first, then falls back to the
        classic _VIDEO_RE and two progressively more relaxed regexes for
        older/alternative page layouts.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist.

        Mixes have no dedicated playlist page; they are paginated by
        repeatedly loading the watch page of the last known video until no
        new video ids appear.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Take the playlist title from the last downloaded page, trying the
        # known title container classes in order of specificity.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download a regular playlist page and return (has_videos, playlist).

        *has_videos* is False when the page turned out not to serve an actual
        playlist (no title and no entries), letting the caller fall back to
        single-video extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Return (video_id, result) when *url* points at a single video.

        *result* is a url_result only when --no-playlist is set; otherwise it
        is None and only the extracted video id (if any) is reported.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2823
2824
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to the app-deeplink meta tags for the channel id
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC' channel id -> 'UU' uploads playlist id
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # Empty channel page: surface YouTube's own alert message if any
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2924
2925
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match otherwise.
        other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Preserve the /user/ vs /c/ path component from the original URL
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
2980
2981
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Resolve the /live page to the current live video if one is found,
        # otherwise fall back to generic extraction of the channel/user page.
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            if page_type.startswith('video') and video_id and re.match(
                    r'^[0-9A-Za-z_-]{11}$', video_id):
                return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3032
3033
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Extraction logic is inherited from YoutubePlaylistsBaseInfoExtractor;
    # this class only contributes the URL pattern and test cases.
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }]
3063
3064
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Video-link regex shared by the search extractors; the title group is
    # optional because not every result markup variant carries it.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3067
3068
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML search results (via the 'spf=navigate' JSON
        wrapper) until *n* results have been collected or no further page is
        available, and returns a playlist_result truncated to *n* entries.
        Raises ExtractorError when YouTube reports no results.
        """

        videos = []
        limit = n

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough results were collected. This used to be
            # a strict '>', which downloaded one page too many (immediately
            # discarded by the truncation below) whenever exactly n results
            # had been gathered.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
3117
3118
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same extraction as YoutubeSearchIE, only the sort order differs.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
3124
3125
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the first page of results for an explicit search URL."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
3146
3147
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just a collection of playlists; delegate to the
        # playlists extraction on the show's /playlists page.
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
3165
3166
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for the personal feed extractors (history,
    subscriptions, recommended).
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        # Works like the playlist extraction, except that the video-id regex
        # carries no index and pagination goes through the "load more"
        # AJAX widget.
        seen_ids = []
        widget_html = content_html = page
        for page_num in itertools.count(1):
            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # The 'recommended' feed has an infinite 'load more' and each new
            # portion spins the same videos in a (sometimes) slightly
            # different order, so stop as soon as a portion adds nothing new.
            fresh = [video_id for video_id in orderedSet(found) if video_id not in seen_ids]
            if not fresh:
                break

            seen_ids.extend(fresh)

            for result in self._ids_to_results(fresh):
                yield result

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if mobj is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3218
3219
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Honour --no-playlist on watch URLs before extracting the whole
        # 'WL' playlist via the inherited helpers.
        _, video = self._check_download_just_video(url, 'WL')
        if video:
            return video
        _, playlist = self._extract_playlist('WL')
        return playlist
3239
3240
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page redirects to a regular playlist; scrape its id
        # and hand off to the playlist extractor.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
3251
3252
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extraction is inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3258
3259
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extraction is inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3265
3266
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extraction is inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3272
3273
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch URLs whose 'v' parameter was lost (typically an unquoted
    # '&' in a shell) and produces a helpful error instead of a cryptic one.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: there is no video id to extract from such a URL.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3321
3322
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose video id is shorter than the canonical
    # 11 characters (i.e. got cut off) and reports that clearly.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: a truncated id can never resolve to a video.
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)