]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/youtube.py
Merge pull request #19 from nixxo/patch-1
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_count,
43 parse_duration,
44 remove_quotes,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 uppercase_escape,
54 url_or_none,
55 urlencode_postdata,
56 )
57
58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in entry points
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # JSON endpoints of the "GlifWebSignIn" login flow used by _login()
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the 'TL' token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist-ID prefixes followed by at least 10 ID characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers identifying the classic web client, sent with JSON page requests
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Force English UI text by setting the PREF cookie on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden login-form fields plus the protocol payload
            # `f_req` to a sign-in endpoint and return the parsed JSON.
            # transform_source strips the anti-XSSI junk before the first '['.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account to obtain an opaque user hash.
        # The payload mirrors what the web sign-in page sends; positions matter.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password challenge for the looked-up account
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # NOTE(review): `%` binds tighter than the conditional, so the
            # 'Unable to login: ' prefix is only shown for the
            # INCORRECT_ANSWER_ENTERED case; other messages print bare.
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                # Step 3 (optional): submit the TOTP code
                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # NOTE(review): same precedence quirk as the login warning
                    # above — the prefix only applies to the first case.
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved automatically; tell the
                # user to resolve them in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Final step: fetch the CheckCookie URL so the session cookies stick
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Request the old (non-polymer) YouTube layout on every page fetch
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Runs once before extraction: force English UI, then try to log in
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
288
289
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Paginates old-style YouTube pages that use a "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from *page* and from every continuation reached via
        the "Load more" button, retrying transient 5xx responses."""
        widget_html = content_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                break

            max_retries = 3
            attempt = 0
            while True:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % load_more.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                except ExtractorError as e:
                    is_server_error = (
                        isinstance(e.cause, compat_HTTPError)
                        and e.cause.code in (500, 503))
                    if is_server_error:
                        attempt += 1
                        if attempt <= max_retries:
                            continue
                    raise
                break

            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = more['load_more_widget_html']
328
329
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Wrap each (id, title) pair found on the page into a url_result
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, appending new video IDs and titles to
        the caller-supplied accumulator lists (deduplicated by video ID)."""
        for match in re.finditer(video_re, page):
            groups = match.groupdict()
            # The link with index 0 is not the first video of the playlist
            # (not sure if still actual).
            # NOTE(review): the guard checks for an 'index' group but compares
            # the 'id' group; with 11-char video IDs it never matches —
            # possibly meant match.group('index'). Behavior preserved as-is.
            if 'index' in groups and match.group('id') == '0':
                continue
            video_id = match.group('id')
            video_title = unescapeHTML(match.group('title')) if 'title' in groups else None
            if video_title:
                video_title = video_title.strip()
            if video_title == '► Play all':
                # The "Play all" link is not a real video title
                video_title = None
            try:
                pos = ids_in_page.index(video_id)
            except ValueError:
                # First occurrence of this ID
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                # Duplicate ID: keep the first entry but fill a missing title
                if video_title and not titles_in_page[pos]:
                    titles_in_page[pos] = video_title

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs found on *page*."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
361
362
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a url_result for every distinct playlist linked on *content*."""
        playlist_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(playlist_ids):
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page and return a playlist of the playlists it links."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, playlist_id), playlist_id, title)
376
377
378 class YoutubeIE(YoutubeBaseInfoExtractor):
379 IE_DESC = 'YouTube.com'
380 _VALID_URL = r"""(?x)^
381 (
382 (?:https?://|//) # http(s):// or protocol-independent URL
383 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
384 (?:www\.)?deturl\.com/www\.youtube\.com/|
385 (?:www\.)?pwnyoutube\.com/|
386 (?:www\.)?hooktube\.com/|
387 (?:www\.)?yourepeat\.com/|
388 tube\.majestyc\.net/|
389 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
390 (?:(?:www|dev)\.)?invidio\.us/|
391 (?:(?:www|no)\.)?invidiou\.sh/|
392 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
393 (?:www\.)?invidious\.kabi\.tk/|
394 (?:www\.)?invidious\.13ad\.de/|
395 (?:www\.)?invidious\.mastodon\.host/|
396 (?:www\.)?invidious\.nixnet\.xyz/|
397 (?:www\.)?invidious\.drycat\.fr/|
398 (?:www\.)?tube\.poal\.co/|
399 (?:www\.)?vid\.wxzm\.sx/|
400 (?:www\.)?yewtu\.be/|
401 (?:www\.)?yt\.elukerio\.org/|
402 (?:www\.)?yt\.lelux\.fi/|
403 (?:www\.)?invidious\.ggc-project\.de/|
404 (?:www\.)?yt\.maisputain\.ovh/|
405 (?:www\.)?invidious\.13ad\.de/|
406 (?:www\.)?invidious\.toot\.koeln/|
407 (?:www\.)?invidious\.fdn\.fr/|
408 (?:www\.)?watch\.nettohikari\.com/|
409 (?:www\.)?kgg2m7yk5aybusll\.onion/|
410 (?:www\.)?qklhadlycap4cnod\.onion/|
411 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
412 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
413 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
414 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
415 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
416 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
417 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
418 (?:.*?\#/)? # handle anchor (#/) redirect urls
419 (?: # the various things that can precede the ID:
420 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
421 |(?: # or the v= param in all its forms
422 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
423 (?:\?|\#!?) # the params delimiter ? or # or #!
424 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
425 v=
426 )
427 ))
428 |(?:
429 youtu\.be| # just youtu.be/xxxx
430 vid\.plus| # or vid.plus/xxxx
431 zwearz\.com/watch| # or zwearz.com/watch/xxxx
432 )/
433 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
434 )
435 )? # all until now is optional -> you can pass the naked ID
436 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
437 (?!.*?\blist=
438 (?:
439 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
440 WL # WL are handled by the watch later IE
441 )
442 )
443 (?(1).+)? # if we found the ID, everything can follow
444 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
445 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
446 _PLAYER_INFO_RE = (
447 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
448 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
449 )
450 _formats = {
451 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
452 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
453 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
454 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
455 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
456 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
457 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
458 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
459 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
460 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
461 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
462 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
463 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
464 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
465 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
466 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
467 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
468 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
469
470
471 # 3D videos
472 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
473 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
474 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
475 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
476 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
477 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
478 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
479
480 # Apple HTTP Live Streaming
481 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
482 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
483 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
484 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
485 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
486 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
487 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
488 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
489
490 # DASH mp4 video
491 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
497 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
499 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
500 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
501 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
502 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
503
504 # Dash mp4 audio
505 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
506 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
507 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
508 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
509 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
510 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
511 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
512
513 # Dash webm
514 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
519 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
520 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
521 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
529 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
530 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
536
537 # Dash webm audio
538 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
539 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
540
541 # Dash webm audio with opus inside
542 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
543 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
544 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
545
546 # RTMP (unnamed)
547 '_rtmp': {'protocol': 'rtmp'},
548
549 # av01 video only formats sometimes served with "unknown" codecs
550 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
553 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
554 }
555 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
556
557 _GEO_BYPASS = False
558
559 IE_NAME = 'youtube'
560 _TESTS = [
561 {
562 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
563 'info_dict': {
564 'id': 'BaW_jenozKc',
565 'ext': 'mp4',
566 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
567 'uploader': 'Philipp Hagemeister',
568 'uploader_id': 'phihag',
569 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
570 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
571 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
572 'upload_date': '20121002',
573 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
574 'categories': ['Science & Technology'],
575 'tags': ['youtube-dl'],
576 'duration': 10,
577 'view_count': int,
578 'like_count': int,
579 'dislike_count': int,
580 'start_time': 1,
581 'end_time': 9,
582 }
583 },
584 {
585 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
586 'note': 'Embed-only video (#1746)',
587 'info_dict': {
588 'id': 'yZIXLfi8CZQ',
589 'ext': 'mp4',
590 'upload_date': '20120608',
591 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
592 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
593 'uploader': 'SET India',
594 'uploader_id': 'setindia',
595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
596 'age_limit': 18,
597 }
598 },
599 {
600 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
601 'note': 'Use the first video ID in the URL',
602 'info_dict': {
603 'id': 'BaW_jenozKc',
604 'ext': 'mp4',
605 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
606 'uploader': 'Philipp Hagemeister',
607 'uploader_id': 'phihag',
608 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
609 'upload_date': '20121002',
610 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
611 'categories': ['Science & Technology'],
612 'tags': ['youtube-dl'],
613 'duration': 10,
614 'view_count': int,
615 'like_count': int,
616 'dislike_count': int,
617 },
618 'params': {
619 'skip_download': True,
620 },
621 },
622 {
623 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
624 'note': '256k DASH audio (format 141) via DASH manifest',
625 'info_dict': {
626 'id': 'a9LDPn-MO4I',
627 'ext': 'm4a',
628 'upload_date': '20121002',
629 'uploader_id': '8KVIDEO',
630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
631 'description': '',
632 'uploader': '8KVIDEO',
633 'title': 'UHDTV TEST 8K VIDEO.mp4'
634 },
635 'params': {
636 'youtube_include_dash_manifest': True,
637 'format': '141',
638 },
639 'skip': 'format 141 not served anymore',
640 },
641 # Controversy video
642 {
643 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
644 'info_dict': {
645 'id': 'T4XJQO3qol8',
646 'ext': 'mp4',
647 'duration': 219,
648 'upload_date': '20100909',
649 'uploader': 'Amazing Atheist',
650 'uploader_id': 'TheAmazingAtheist',
651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
652 'title': 'Burning Everyone\'s Koran',
653 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
654 }
655 },
656 # Normal age-gate video (embed allowed)
657 {
658 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
659 'info_dict': {
660 'id': 'HtVdAasjOgU',
661 'ext': 'mp4',
662 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
663 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
664 'duration': 142,
665 'uploader': 'The Witcher',
666 'uploader_id': 'WitcherGame',
667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
668 'upload_date': '20140605',
669 'age_limit': 18,
670 },
671 },
672 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
673 {
674 'url': 'lqQg6PlCWgI',
675 'info_dict': {
676 'id': 'lqQg6PlCWgI',
677 'ext': 'mp4',
678 'duration': 6085,
679 'upload_date': '20150827',
680 'uploader_id': 'olympic',
681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
682 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
683 'uploader': 'Olympic',
684 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
685 },
686 'params': {
687 'skip_download': 'requires avconv',
688 }
689 },
690 # Non-square pixels
691 {
692 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
693 'info_dict': {
694 'id': '_b-2C3KPAM0',
695 'ext': 'mp4',
696 'stretched_ratio': 16 / 9.,
697 'duration': 85,
698 'upload_date': '20110310',
699 'uploader_id': 'AllenMeow',
700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
701 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
702 'uploader': '孫ᄋᄅ',
703 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
704 },
705 },
706 # url_encoded_fmt_stream_map is empty string
707 {
708 'url': 'qEJwOuvDf7I',
709 'info_dict': {
710 'id': 'qEJwOuvDf7I',
711 'ext': 'webm',
712 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
713 'description': '',
714 'upload_date': '20150404',
715 'uploader_id': 'spbelect',
716 'uploader': 'Наблюдатели Петербурга',
717 },
718 'params': {
719 'skip_download': 'requires avconv',
720 },
721 'skip': 'This live event has ended.',
722 },
723 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
724 {
725 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
726 'info_dict': {
727 'id': 'FIl7x6_3R5Y',
728 'ext': 'webm',
729 'title': 'md5:7b81415841e02ecd4313668cde88737a',
730 'description': 'md5:116377fd2963b81ec4ce64b542173306',
731 'duration': 220,
732 'upload_date': '20150625',
733 'uploader_id': 'dorappi2000',
734 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
735 'uploader': 'dorappi2000',
736 'formats': 'mincount:31',
737 },
738 'skip': 'not actual anymore',
739 },
740 # DASH manifest with segment_list
741 {
742 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
743 'md5': '8ce563a1d667b599d21064e982ab9e31',
744 'info_dict': {
745 'id': 'CsmdDsKjzN8',
746 'ext': 'mp4',
747 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
748 'uploader': 'Airtek',
749 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
750 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
751 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
752 },
753 'params': {
754 'youtube_include_dash_manifest': True,
755 'format': '135', # bestvideo
756 },
757 'skip': 'This live event has ended.',
758 },
759 {
760 # Multifeed videos (multiple cameras), URL is for Main Camera
761 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
762 'info_dict': {
763 'id': 'jqWvoWXjCVs',
764 'title': 'teamPGP: Rocket League Noob Stream',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 },
767 'playlist': [{
768 'info_dict': {
769 'id': 'jqWvoWXjCVs',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
773 'duration': 7335,
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
778 'license': 'Standard YouTube License',
779 },
780 }, {
781 'info_dict': {
782 'id': '6h8e8xoXJzg',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
786 'duration': 7337,
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
791 'license': 'Standard YouTube License',
792 },
793 }, {
794 'info_dict': {
795 'id': 'PUOgX5z9xZw',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7337,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }, {
807 'info_dict': {
808 'id': 'teuwxikvS5k',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (zim)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
812 'duration': 7334,
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
817 'license': 'Standard YouTube License',
818 },
819 }],
820 'params': {
821 'skip_download': True,
822 },
823 'skip': 'This video is not available.',
824 },
825 {
826 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
827 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
828 'info_dict': {
829 'id': 'gVfLd0zydlo',
830 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
831 },
832 'playlist_count': 2,
833 'skip': 'Not multifeed anymore',
834 },
835 {
836 'url': 'https://vid.plus/FlRa-iH7PGw',
837 'only_matching': True,
838 },
839 {
840 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
841 'only_matching': True,
842 },
843 {
844 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
845 # Also tests cut-off URL expansion in video description (see
846 # https://github.com/ytdl-org/youtube-dl/issues/1892,
847 # https://github.com/ytdl-org/youtube-dl/issues/8164)
848 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
849 'info_dict': {
850 'id': 'lsguqyKfVQg',
851 'ext': 'mp4',
852 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
853 'alt_title': 'Dark Walk - Position Music',
854 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
855 'duration': 133,
856 'upload_date': '20151119',
857 'uploader_id': 'IronSoulElf',
858 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
859 'uploader': 'IronSoulElf',
860 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
861 'track': 'Dark Walk - Position Music',
862 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
863 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
864 },
865 'params': {
866 'skip_download': True,
867 },
868 },
869 {
870 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
871 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
872 'only_matching': True,
873 },
874 {
875 # Video with yt:stretch=17:0
876 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
877 'info_dict': {
878 'id': 'Q39EVAstoRM',
879 'ext': 'mp4',
880 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
881 'description': 'md5:ee18a25c350637c8faff806845bddee9',
882 'upload_date': '20151107',
883 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
884 'uploader': 'CH GAMER DROID',
885 },
886 'params': {
887 'skip_download': True,
888 },
889 'skip': 'This video does not exist.',
890 },
891 {
892 # Video licensed under Creative Commons
893 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
894 'info_dict': {
895 'id': 'M4gD1WSo5mA',
896 'ext': 'mp4',
897 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
898 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
899 'duration': 721,
900 'upload_date': '20150127',
901 'uploader_id': 'BerkmanCenter',
902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
903 'uploader': 'The Berkman Klein Center for Internet & Society',
904 'license': 'Creative Commons Attribution license (reuse allowed)',
905 },
906 'params': {
907 'skip_download': True,
908 },
909 },
910 {
911 # Channel-like uploader_url
912 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
913 'info_dict': {
914 'id': 'eQcmzGIKrzg',
915 'ext': 'mp4',
916 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
917 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
918 'duration': 4060,
919 'upload_date': '20151119',
920 'uploader': 'Bernie Sanders',
921 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
922 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
923 'license': 'Creative Commons Attribution license (reuse allowed)',
924 },
925 'params': {
926 'skip_download': True,
927 },
928 },
929 {
930 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
931 'only_matching': True,
932 },
933 {
934 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
935 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
936 'only_matching': True,
937 },
938 {
939 # Rental video preview
940 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
941 'info_dict': {
942 'id': 'uGpuVWrhIzE',
943 'ext': 'mp4',
944 'title': 'Piku - Trailer',
945 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
946 'upload_date': '20150811',
947 'uploader': 'FlixMatrix',
948 'uploader_id': 'FlixMatrixKaravan',
949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
950 'license': 'Standard YouTube License',
951 },
952 'params': {
953 'skip_download': True,
954 },
955 'skip': 'This video is not available.',
956 },
957 {
958 # YouTube Red video with episode data
959 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
960 'info_dict': {
961 'id': 'iqKdEhx-dD4',
962 'ext': 'mp4',
963 'title': 'Isolation - Mind Field (Ep 1)',
964 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
965 'duration': 2085,
966 'upload_date': '20170118',
967 'uploader': 'Vsauce',
968 'uploader_id': 'Vsauce',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
970 'series': 'Mind Field',
971 'season_number': 1,
972 'episode_number': 1,
973 },
974 'params': {
975 'skip_download': True,
976 },
977 'expected_warnings': [
978 'Skipping DASH manifest',
979 ],
980 },
981 {
982 # The following content has been identified by the YouTube community
983 # as inappropriate or offensive to some audiences.
984 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
985 'info_dict': {
986 'id': '6SJNVb0GnPI',
987 'ext': 'mp4',
988 'title': 'Race Differences in Intelligence',
989 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
990 'duration': 965,
991 'upload_date': '20140124',
992 'uploader': 'New Century Foundation',
993 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
994 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
995 },
996 'params': {
997 'skip_download': True,
998 },
999 },
1000 {
1001 # itag 212
1002 'url': '1t24XAntNCY',
1003 'only_matching': True,
1004 },
1005 {
1006 # geo restricted to JP
1007 'url': 'sJL6WA-aGkQ',
1008 'only_matching': True,
1009 },
1010 {
1011 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1012 'only_matching': True,
1013 },
1014 {
1015 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1016 'only_matching': True,
1017 },
1018 {
1019 # DRM protected
1020 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1021 'only_matching': True,
1022 },
1023 {
1024 # Video with unsupported adaptive stream type formats
1025 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1026 'info_dict': {
1027 'id': 'Z4Vy8R84T1U',
1028 'ext': 'mp4',
1029 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1030 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1031 'duration': 433,
1032 'upload_date': '20130923',
1033 'uploader': 'Amelia Putri Harwita',
1034 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1036 'formats': 'maxcount:10',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 'youtube_include_dash_manifest': False,
1041 },
1042 'skip': 'not actual anymore',
1043 },
1044 {
1045 # Youtube Music Auto-generated description
1046 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1047 'info_dict': {
1048 'id': 'MgNrAu2pzNs',
1049 'ext': 'mp4',
1050 'title': 'Voyeur Girl',
1051 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1052 'upload_date': '20190312',
1053 'uploader': 'Stephen - Topic',
1054 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1055 'artist': 'Stephen',
1056 'track': 'Voyeur Girl',
1057 'album': 'it\'s too much love to know my dear',
1058 'release_date': '20190313',
1059 'release_year': 2019,
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
1066 # Youtube Music Auto-generated description
1067 # Retrieve 'artist' field from 'Artist:' in video description
1068 # when it is present on youtube music video
1069 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1070 'info_dict': {
1071 'id': 'k0jLE7tTwjY',
1072 'ext': 'mp4',
1073 'title': 'Latch Feat. Sam Smith',
1074 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1075 'upload_date': '20150110',
1076 'uploader': 'Various Artists - Topic',
1077 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1078 'artist': 'Disclosure',
1079 'track': 'Latch Feat. Sam Smith',
1080 'album': 'Latch Featuring Sam Smith',
1081 'release_date': '20121008',
1082 'release_year': 2012,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
1089 # Youtube Music Auto-generated description
1090 # handle multiple artists on youtube music video
1091 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1092 'info_dict': {
1093 'id': '74qn0eJSjpA',
1094 'ext': 'mp4',
1095 'title': 'Eastside',
1096 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1097 'upload_date': '20180710',
1098 'uploader': 'Benny Blanco - Topic',
1099 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1100 'artist': 'benny blanco, Halsey, Khalid',
1101 'track': 'Eastside',
1102 'album': 'Eastside',
1103 'release_date': '20180713',
1104 'release_year': 2018,
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 {
1111 # Youtube Music Auto-generated description
1112 # handle youtube music video with release_year and no release_date
1113 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1114 'info_dict': {
1115 'id': '-hcAI0g-f5M',
1116 'ext': 'mp4',
1117 'title': 'Put It On Me',
1118 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1119 'upload_date': '20180426',
1120 'uploader': 'Matt Maeson - Topic',
1121 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1122 'artist': 'Matt Maeson',
1123 'track': 'Put It On Me',
1124 'album': 'The Hearse',
1125 'release_date': None,
1126 'release_year': 2018,
1127 },
1128 'params': {
1129 'skip_download': True,
1130 },
1131 },
1132 {
1133 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1134 'only_matching': True,
1135 },
1136 {
1137 # invalid -> valid video id redirection
1138 'url': 'DJztXj2GPfl',
1139 'info_dict': {
1140 'id': 'DJztXj2GPfk',
1141 'ext': 'mp4',
1142 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1143 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1144 'upload_date': '20090125',
1145 'uploader': 'Prochorowka',
1146 'uploader_id': 'Prochorowka',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1148 'artist': 'Panjabi MC',
1149 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1150 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1151 },
1152 'params': {
1153 'skip_download': True,
1154 },
1155 },
1156 {
1157 # empty description results in an empty string
1158 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1159 'info_dict': {
1160 'id': 'x41yOUIvK2k',
1161 'ext': 'mp4',
1162 'title': 'IMG 3456',
1163 'description': '',
1164 'upload_date': '20170613',
1165 'uploader_id': 'ElevageOrVert',
1166 'uploader': 'ElevageOrVert',
1167 },
1168 'params': {
1169 'skip_download': True,
1170 },
1171 },
1172 ]
1173
1174 def __init__(self, *args, **kwargs):
1175 super(YoutubeIE, self).__init__(*args, **kwargs)
1176 self._player_cache = {}
1177
1178 def report_video_info_webpage_download(self, video_id):
1179 """Report attempt to download video info webpage."""
1180 self.to_screen('%s: Downloading video info webpage' % video_id)
1181
1182 def report_information_extraction(self, video_id):
1183 """Report attempt to extract video information."""
1184 self.to_screen('%s: Extracting video information' % video_id)
1185
1186 def report_unavailable_format(self, video_id, format):
1187 """Report extracted video URL."""
1188 self.to_screen('%s: Format %s not available' % (video_id, format))
1189
1190 def report_rtmp_download(self):
1191 """Indicate the download will use the RTMP protocol."""
1192 self.to_screen('RTMP download detected')
1193
1194 def _signature_cache_id(self, example_sig):
1195 """ Return a string representation of a signature """
1196 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1197
1198 @classmethod
1199 def _extract_player_info(cls, player_url):
1200 for player_re in cls._PLAYER_INFO_RE:
1201 id_m = re.search(player_re, player_url)
1202 if id_m:
1203 break
1204 else:
1205 raise ExtractorError('Cannot identify player %r' % player_url)
1206 return id_m.group('ext'), id_m.group('id')
1207
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (and cache) the signature-decryption function for a player.

        Downloads the JS or SWF player referenced by player_url, extracts
        its signature-scrambling routine and returns a callable mapping an
        encrypted signature string to the decrypted one.  The computed
        character permutation is stored in the filesystem cache keyed by
        player id and the signature's length layout, so later runs can skip
        the player download.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # func_id doubles as the cache key/filename; the basename assert
        # guards against path separators sneaking into it.
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices describing a fixed
            # permutation of the signature characters.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the permutation by running the extracted function on a
        # probe string of distinct characters; the caching scheme assumes
        # the function only reorders/drops characters.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1247
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs func on a probe string, recovers the character permutation it
        performs and renders it as readable index/slice expressions (used
        by the youtube_print_sig_code debugging option).
        """
        def gen_sig_code(idxs):
            # Yield 's[i]' terms, compressing runs with stride +/-1 into
            # slice expressions.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # end is inclusive here; extend by step to get the Python
                # exclusive slice bound (':' when it would go negative).
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or flush the finished slice.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the final open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1286
    def _parse_sig_js(self, jscode):
        """Locate the signature-scrambling function in the player JS.

        Returns a callable mapping an encrypted signature string to its
        decrypted form, executed through the bundled JS interpreter.
        """
        # The function name is found by matching known call-site shapes in
        # the player code; current patterns come first, obsolete ones are
        # kept as fallbacks for older players.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The extracted JS function takes its argument list as a sequence.
        return lambda s: initial_function([s])
1307
1308 def _parse_sig_swf(self, file_contents):
1309 swfi = SWFInterpreter(file_contents)
1310 TARGET_CLASSNAME = 'SignatureDecipher'
1311 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1312 initial_function = swfi.extract_function(searched_class, 'decipher')
1313 return lambda s: initial_function([s])
1314
1315 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1316 """Turn the encrypted s field into a working signature"""
1317
1318 if player_url is None:
1319 raise ExtractorError('Cannot decrypt signature without player_url')
1320
1321 if player_url.startswith('//'):
1322 player_url = 'https:' + player_url
1323 elif not re.match(r'https?://', player_url):
1324 player_url = compat_urlparse.urljoin(
1325 'https://www.youtube.com', player_url)
1326 try:
1327 player_id = (player_url, self._signature_cache_id(s))
1328 if player_id not in self._player_cache:
1329 func = self._extract_signature_function(
1330 video_id, player_url, s
1331 )
1332 self._player_cache[player_id] = func
1333 func = self._player_cache[player_id]
1334 if self._downloader.params.get('youtube_print_sig_code'):
1335 self._print_sig_code(func, s)
1336 return func(s)
1337 except Exception as e:
1338 tb = traceback.format_exc()
1339 raise ExtractorError(
1340 'Signature extraction failed: ' + tb, cause=e)
1341
1342 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1343 try:
1344 subs_doc = self._download_xml(
1345 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1346 video_id, note=False)
1347 except ExtractorError as err:
1348 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1349 return {}
1350
1351 sub_lang_list = {}
1352 for track in subs_doc.findall('track'):
1353 lang = track.attrib['lang_code']
1354 if lang in sub_lang_list:
1355 continue
1356 sub_formats = []
1357 for ext in self._SUBTITLE_FORMATS:
1358 params = compat_urllib_parse_urlencode({
1359 'lang': lang,
1360 'v': video_id,
1361 'fmt': ext,
1362 'name': track.attrib['name'].encode('utf-8'),
1363 })
1364 sub_formats.append({
1365 'url': 'https://www.youtube.com/api/timedtext?' + params,
1366 'ext': ext,
1367 })
1368 sub_lang_list[lang] = sub_formats
1369 if has_live_chat_replay:
1370 sub_lang_list['live_chat'] = [
1371 {
1372 'video_id': video_id,
1373 'ext': 'json',
1374 'protocol': 'youtube_live_chat_replay',
1375 },
1376 ]
1377 if not sub_lang_list:
1378 self._downloader.report_warning('video doesn\'t have subtitles')
1379 return {}
1380 return sub_lang_list
1381
1382 def _get_ytplayer_config(self, video_id, webpage):
1383 patterns = (
1384 # User data may contain arbitrary character sequences that may affect
1385 # JSON extraction with regex, e.g. when '};' is contained the second
1386 # regex won't capture the whole JSON. Yet working around by trying more
1387 # concrete regex first keeping in mind proper quoted string handling
1388 # to be implemented in future that will replace this workaround (see
1389 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1390 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1391 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1392 r';ytplayer\.config\s*=\s*({.+?});',
1393 )
1394 config = self._search_regex(
1395 patterns, webpage, 'ytplayer.config', default=None)
1396 if config:
1397 return self._parse_json(
1398 uppercase_escape(config), video_id, fatal=False)
1399
1400 def _get_yt_initial_data(self, video_id, webpage):
1401 config = self._search_regex(
1402 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1403 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
1404 webpage, 'ytInitialData', default=None)
1405 if config:
1406 return self._parse_json(
1407 uppercase_escape(config), video_id, fatal=False)
1408
    def _get_automatic_captions(self, video_id, webpage):
        """Return automatic (ASR/translated) captions as {lang: [formats]}.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.  Three extraction paths are tried
        in order: the legacy ttsurl listing, the player_response captions
        renderer (format as of 22.06.2017), and the obsolete caption_tracks
        args.  Returns {} (after a warning) when nothing can be extracted.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One format entry per (target language, subtitle format),
                # all derived from the single caption_url.
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Expand one caption base URL into per-language, per-format
                # entries by rewriting its query string.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    caption_tracks = renderer['captionTracks']
                    for caption_track in caption_tracks:
                        if 'kind' not in caption_track:
                            # not an automatic transcription
                            continue
                        base_url = caption_track['baseUrl']
                        sub_lang_list = []
                        for lang in renderer['translationLanguages']:
                            lang_code = lang.get('languageCode')
                            if lang_code:
                                sub_lang_list.append(lang_code)
                        # Only the first automatic track is used.
                        return make_captions(base_url, sub_lang_list)

                    self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                    return {}
            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1517
1518 def _mark_watched(self, video_id, video_info, player_response):
1519 playback_url = url_or_none(try_get(
1520 player_response,
1521 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1522 video_info, lambda x: x['videostats_playback_base_url'][0]))
1523 if not playback_url:
1524 return
1525 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1526 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1527
1528 # cpn generation algorithm is reverse engineered from base.js.
1529 # In fact it works even with dummy cpn.
1530 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1531 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1532
1533 qs.update({
1534 'ver': ['2'],
1535 'cpn': [cpn],
1536 })
1537 playback_url = compat_urlparse.urlunparse(
1538 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1539
1540 self._download_webpage(
1541 playback_url, video_id, 'Marking watched',
1542 'Unable to mark watched', fatal=False)
1543
1544 @staticmethod
1545 def _extract_urls(webpage):
1546 # Embedded YouTube player
1547 entries = [
1548 unescapeHTML(mobj.group('url'))
1549 for mobj in re.finditer(r'''(?x)
1550 (?:
1551 <iframe[^>]+?src=|
1552 data-video-url=|
1553 <embed[^>]+?src=|
1554 embedSWF\(?:\s*|
1555 <object[^>]+data=|
1556 new\s+SWFObject\(
1557 )
1558 (["\'])
1559 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1560 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1561 \1''', webpage)]
1562
1563 # lazyYT YouTube embed
1564 entries.extend(list(map(
1565 unescapeHTML,
1566 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1567
1568 # Wordpress "YouTube Video Importer" plugin
1569 matches = re.findall(r'''(?x)<div[^>]+
1570 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1571 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1572 entries.extend(m[-1] for m in matches)
1573
1574 return entries
1575
1576 @staticmethod
1577 def _extract_url(webpage):
1578 urls = YoutubeIE._extract_urls(webpage)
1579 return urls[0] if urls else None
1580
1581 @classmethod
1582 def extract_id(cls, url):
1583 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1584 if mobj is None:
1585 raise ExtractorError('Invalid URL: %s' % url)
1586 video_id = mobj.group(2)
1587 return video_id
1588
1589 def _extract_chapters_from_json(self, webpage, video_id, duration):
1590 if not webpage:
1591 return
1592 initial_data = self._parse_json(
1593 self._search_regex(
1594 r'window\["ytInitialData"\] = (.+);\n', webpage,
1595 'player args', default='{}'),
1596 video_id, fatal=False)
1597 if not initial_data or not isinstance(initial_data, dict):
1598 return
1599 chapters_list = try_get(
1600 initial_data,
1601 lambda x: x['playerOverlays']
1602 ['playerOverlayRenderer']
1603 ['decoratedPlayerBarRenderer']
1604 ['decoratedPlayerBarRenderer']
1605 ['playerBar']
1606 ['chapteredPlayerBarRenderer']
1607 ['chapters'],
1608 list)
1609 if not chapters_list:
1610 return
1611
1612 def chapter_time(chapter):
1613 return float_or_none(
1614 try_get(
1615 chapter,
1616 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1617 int),
1618 scale=1000)
1619 chapters = []
1620 for next_num, chapter in enumerate(chapters_list, start=1):
1621 start_time = chapter_time(chapter)
1622 if start_time is None:
1623 continue
1624 end_time = (chapter_time(chapters_list[next_num])
1625 if next_num < len(chapters_list) else duration)
1626 if end_time is None:
1627 continue
1628 title = try_get(
1629 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1630 compat_str)
1631 chapters.append({
1632 'start_time': start_time,
1633 'end_time': end_time,
1634 'title': title,
1635 })
1636 return chapters
1637
    @staticmethod
    def _extract_chapters_from_description(description, duration):
        """Parse chapter markers out of seekTo links in the video description.

        Returns a list of {start_time, end_time, title} dicts, or None when
        the description contains no recognizable timestamp lines.
        """
        if not description:
            return None
        # Each match is (full line, timestamp text) for description lines
        # containing a yt.www.watch.player.seekTo anchor.
        chapter_lines = re.findall(
            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
            description)
        if not chapter_lines:
            return None
        chapters = []
        for next_num, (chapter_line, time_point) in enumerate(
                chapter_lines, start=1):
            start_time = parse_duration(time_point)
            if start_time is None:
                continue
            if start_time > duration:
                # A start beyond the video length ends the chapter list.
                break
            # This chapter ends where the next timestamp starts; the last
            # one ends at the video duration.
            end_time = (duration if next_num == len(chapter_lines)
                        else parse_duration(chapter_lines[next_num][1]))
            if end_time is None:
                continue
            if end_time > duration:
                end_time = duration
            if start_time > end_time:
                break
            # Strip the seekTo anchor markup and tidy whitespace/dashes to
            # obtain the chapter title.
            chapter_title = re.sub(
                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
            chapter_title = re.sub(r'\s+', ' ', chapter_title)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': chapter_title,
            })
        return chapters
1672
1673 def _extract_chapters(self, webpage, description, video_id, duration):
1674 return (self._extract_chapters_from_json(webpage, video_id, duration)
1675 or self._extract_chapters_from_description(description, duration))
1676
1677 def _real_extract(self, url):
1678 url, smuggled_data = unsmuggle_url(url, {})
1679
1680 proto = (
1681 'http' if self._downloader.params.get('prefer_insecure', False)
1682 else 'https')
1683
1684 start_time = None
1685 end_time = None
1686 parsed_url = compat_urllib_parse_urlparse(url)
1687 for component in [parsed_url.fragment, parsed_url.query]:
1688 query = compat_parse_qs(component)
1689 if start_time is None and 't' in query:
1690 start_time = parse_duration(query['t'][0])
1691 if start_time is None and 'start' in query:
1692 start_time = parse_duration(query['start'][0])
1693 if end_time is None and 'end' in query:
1694 end_time = parse_duration(query['end'][0])
1695
1696 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1697 mobj = re.search(self._NEXT_URL_RE, url)
1698 if mobj:
1699 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1700 video_id = self.extract_id(url)
1701
1702 # Get video webpage
1703 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1704 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1705
1706 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1707 video_id = qs.get('v', [None])[0] or video_id
1708
1709 # Attempt to extract SWF player URL
1710 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1711 if mobj is not None:
1712 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1713 else:
1714 player_url = None
1715
1716 dash_mpds = []
1717
1718 def add_dash_mpd(video_info):
1719 dash_mpd = video_info.get('dashmpd')
1720 if dash_mpd and dash_mpd[0] not in dash_mpds:
1721 dash_mpds.append(dash_mpd[0])
1722
1723 def add_dash_mpd_pr(pl_response):
1724 dash_mpd = url_or_none(try_get(
1725 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1726 compat_str))
1727 if dash_mpd and dash_mpd not in dash_mpds:
1728 dash_mpds.append(dash_mpd)
1729
1730 is_live = None
1731 view_count = None
1732
1733 def extract_view_count(v_info):
1734 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1735
1736 def extract_player_response(player_response, video_id):
1737 pl_response = str_or_none(player_response)
1738 if not pl_response:
1739 return
1740 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1741 if isinstance(pl_response, dict):
1742 add_dash_mpd_pr(pl_response)
1743 return pl_response
1744
1745 def extract_embedded_config(embed_webpage, video_id):
1746 embedded_config = self._search_regex(
1747 r'setConfig\(({.*})\);',
1748 embed_webpage, 'ytInitialData', default=None)
1749 if embedded_config:
1750 return embedded_config
1751
1752 player_response = {}
1753
1754 # Get video info
1755 video_info = {}
1756 embed_webpage = None
1757 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1758 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1759 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1760 age_gate = True
1761 # We simulate the access to the video from www.youtube.com/v/{video_id}
1762 # this can be viewed without login into Youtube
1763 url = proto + '://www.youtube.com/embed/%s' % video_id
1764 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1765 ext = extract_embedded_config(embed_webpage, video_id)
1766 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1767 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1768 if not playable_in_embed:
1769 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1770 playable_in_embed = ''
1771 else:
1772 playable_in_embed = playable_in_embed.group('playableinEmbed')
1773 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1774 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1775 if playable_in_embed == 'false':
1776 '''
1777 # TODO apply this patch when Support for Python 2.6(!) and above drops
1778 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1779 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1780 '''
1781 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1782 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1783 age_gate = False
1784 # Try looking directly into the video webpage
1785 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1786 if ytplayer_config:
1787 args = ytplayer_config['args']
1788 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1789 # Convert to the same format returned by compat_parse_qs
1790 video_info = dict((k, [v]) for k, v in args.items())
1791 add_dash_mpd(video_info)
1792 # Rental video is not rented but preview is available (e.g.
1793 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1794 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1795 if not video_info and args.get('ypc_vid'):
1796 return self.url_result(
1797 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1798 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1799 is_live = True
1800 if not player_response:
1801 player_response = extract_player_response(args.get('player_response'), video_id)
1802 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1803 add_dash_mpd_pr(player_response)
1804 else:
1805 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1806 else:
1807 data = compat_urllib_parse_urlencode({
1808 'video_id': video_id,
1809 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1810 'sts': self._search_regex(
1811 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1812 })
1813 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1814 try:
1815 video_info_webpage = self._download_webpage(
1816 video_info_url, video_id,
1817 note='Refetching age-gated info webpage',
1818 errnote='unable to download video info webpage')
1819 except ExtractorError:
1820 video_info_webpage = None
1821 if video_info_webpage:
1822 video_info = compat_parse_qs(video_info_webpage)
1823 pl_response = video_info.get('player_response', [None])[0]
1824 player_response = extract_player_response(pl_response, video_id)
1825 add_dash_mpd(video_info)
1826 view_count = extract_view_count(video_info)
1827 else:
1828 age_gate = False
1829 # Try looking directly into the video webpage
1830 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1831 if ytplayer_config:
1832 args = ytplayer_config['args']
1833 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1834 # Convert to the same format returned by compat_parse_qs
1835 video_info = dict((k, [v]) for k, v in args.items())
1836 add_dash_mpd(video_info)
1837 # Rental video is not rented but preview is available (e.g.
1838 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1839 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1840 if not video_info and args.get('ypc_vid'):
1841 return self.url_result(
1842 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1843 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1844 is_live = True
1845 if not player_response:
1846 player_response = extract_player_response(args.get('player_response'), video_id)
1847 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1848 add_dash_mpd_pr(player_response)
1849
1850 def extract_unavailable_message():
1851 messages = []
1852 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1853 msg = self._html_search_regex(
1854 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1855 video_webpage, 'unavailable %s' % kind, default=None)
1856 if msg:
1857 messages.append(msg)
1858 if messages:
1859 return '\n'.join(messages)
1860
1861 if not video_info and not player_response:
1862 unavailable_message = extract_unavailable_message()
1863 if not unavailable_message:
1864 unavailable_message = 'Unable to extract video data'
1865 raise ExtractorError(
1866 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1867
1868 if not isinstance(video_info, dict):
1869 video_info = {}
1870
1871 video_details = try_get(
1872 player_response, lambda x: x['videoDetails'], dict) or {}
1873
1874 microformat = try_get(
1875 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1876
1877 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1878 if not video_title:
1879 self._downloader.report_warning('Unable to extract video title')
1880 video_title = '_'
1881
1882 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1883 if video_description:
1884
1885 def replace_url(m):
1886 redir_url = compat_urlparse.urljoin(url, m.group(1))
1887 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1888 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1889 qs = compat_parse_qs(parsed_redir_url.query)
1890 q = qs.get('q')
1891 if q and q[0]:
1892 return q[0]
1893 return redir_url
1894
1895 description_original = video_description = re.sub(r'''(?x)
1896 <a\s+
1897 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1898 (?:title|href)="([^"]+)"\s+
1899 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1900 class="[^"]*"[^>]*>
1901 [^<]+\.{3}\s*
1902 </a>
1903 ''', replace_url, video_description)
1904 video_description = clean_html(video_description)
1905 else:
1906 video_description = video_details.get('shortDescription')
1907 if video_description is None:
1908 video_description = self._html_search_meta('description', video_webpage)
1909
1910 if not smuggled_data.get('force_singlefeed', False):
1911 if not self._downloader.params.get('noplaylist'):
1912 multifeed_metadata_list = try_get(
1913 player_response,
1914 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1915 compat_str) or try_get(
1916 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1917 if multifeed_metadata_list:
1918 entries = []
1919 feed_ids = []
1920 for feed in multifeed_metadata_list.split(','):
1921 # Unquote should take place before split on comma (,) since textual
1922 # fields may contain comma as well (see
1923 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1924 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1925
1926 def feed_entry(name):
1927 return try_get(feed_data, lambda x: x[name][0], compat_str)
1928
1929 feed_id = feed_entry('id')
1930 if not feed_id:
1931 continue
1932 feed_title = feed_entry('title')
1933 title = video_title
1934 if feed_title:
1935 title += ' (%s)' % feed_title
1936 entries.append({
1937 '_type': 'url_transparent',
1938 'ie_key': 'Youtube',
1939 'url': smuggle_url(
1940 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1941 {'force_singlefeed': True}),
1942 'title': title,
1943 })
1944 feed_ids.append(feed_id)
1945 self.to_screen(
1946 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1947 % (', '.join(feed_ids), video_id))
1948 return self.playlist_result(entries, video_id, video_title, video_description)
1949 else:
1950 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1951
1952 if view_count is None:
1953 view_count = extract_view_count(video_info)
1954 if view_count is None and video_details:
1955 view_count = int_or_none(video_details.get('viewCount'))
1956 if view_count is None and microformat:
1957 view_count = int_or_none(microformat.get('viewCount'))
1958
1959 if is_live is None:
1960 is_live = bool_or_none(video_details.get('isLive'))
1961
1962 has_live_chat_replay = False
1963 if not is_live:
1964 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1965 try:
1966 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1967 has_live_chat_replay = True
1968 except (KeyError, IndexError, TypeError):
1969 pass
1970
1971 # Check for "rental" videos
1972 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1973 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1974
1975 def _extract_filesize(media_url):
1976 return int_or_none(self._search_regex(
1977 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1978
1979 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1980 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1981
1982 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1983 self.report_rtmp_download()
1984 formats = [{
1985 'format_id': '_rtmp',
1986 'protocol': 'rtmp',
1987 'url': video_info['conn'][0],
1988 'player_url': player_url,
1989 }]
1990 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1991 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1992 if 'rtmpe%3Dyes' in encoded_url_map:
1993 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1994 formats = []
1995 formats_spec = {}
1996 fmt_list = video_info.get('fmt_list', [''])[0]
1997 if fmt_list:
1998 for fmt in fmt_list.split(','):
1999 spec = fmt.split('/')
2000 if len(spec) > 1:
2001 width_height = spec[1].split('x')
2002 if len(width_height) == 2:
2003 formats_spec[spec[0]] = {
2004 'resolution': spec[1],
2005 'width': int_or_none(width_height[0]),
2006 'height': int_or_none(width_height[1]),
2007 }
2008 for fmt in streaming_formats:
2009 itag = str_or_none(fmt.get('itag'))
2010 if not itag:
2011 continue
2012 quality = fmt.get('quality')
2013 quality_label = fmt.get('qualityLabel') or quality
2014 formats_spec[itag] = {
2015 'asr': int_or_none(fmt.get('audioSampleRate')),
2016 'filesize': int_or_none(fmt.get('contentLength')),
2017 'format_note': quality_label,
2018 'fps': int_or_none(fmt.get('fps')),
2019 'height': int_or_none(fmt.get('height')),
2020 # bitrate for itag 43 is always 2147483647
2021 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2022 'width': int_or_none(fmt.get('width')),
2023 }
2024
2025 for fmt in streaming_formats:
2026 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2027 continue
2028 url = url_or_none(fmt.get('url'))
2029
2030 if not url:
2031 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2032 if not cipher:
2033 continue
2034 url_data = compat_parse_qs(cipher)
2035 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2036 if not url:
2037 continue
2038 else:
2039 cipher = None
2040 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2041
2042 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2043 # Unsupported FORMAT_STREAM_TYPE_OTF
2044 if stream_type == 3:
2045 continue
2046
2047 format_id = fmt.get('itag') or url_data['itag'][0]
2048 if not format_id:
2049 continue
2050 format_id = compat_str(format_id)
2051
2052 if cipher:
2053 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2054 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
2055 jsplayer_url_json = self._search_regex(
2056 ASSETS_RE,
2057 embed_webpage if age_gate else video_webpage,
2058 'JS player URL (1)', default=None)
2059 if not jsplayer_url_json and not age_gate:
2060 # We need the embed website after all
2061 if embed_webpage is None:
2062 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2063 embed_webpage = self._download_webpage(
2064 embed_url, video_id, 'Downloading embed webpage')
2065 jsplayer_url_json = self._search_regex(
2066 ASSETS_RE, embed_webpage, 'JS player URL')
2067
2068 player_url = json.loads(jsplayer_url_json)
2069 if player_url is None:
2070 player_url_json = self._search_regex(
2071 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2072 video_webpage, 'age gate player URL')
2073 player_url = json.loads(player_url_json)
2074
2075 if 'sig' in url_data:
2076 url += '&signature=' + url_data['sig'][0]
2077 elif 's' in url_data:
2078 encrypted_sig = url_data['s'][0]
2079
2080 if self._downloader.params.get('verbose'):
2081 if player_url is None:
2082 player_desc = 'unknown'
2083 else:
2084 player_type, player_version = self._extract_player_info(player_url)
2085 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2086 parts_sizes = self._signature_cache_id(encrypted_sig)
2087 self.to_screen('{%s} signature length %s, %s' %
2088 (format_id, parts_sizes, player_desc))
2089
2090 signature = self._decrypt_signature(
2091 encrypted_sig, video_id, player_url, age_gate)
2092 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2093 url += '&%s=%s' % (sp, signature)
2094 if 'ratebypass' not in url:
2095 url += '&ratebypass=yes'
2096
2097 dct = {
2098 'format_id': format_id,
2099 'url': url,
2100 'player_url': player_url,
2101 }
2102 if format_id in self._formats:
2103 dct.update(self._formats[format_id])
2104 if format_id in formats_spec:
2105 dct.update(formats_spec[format_id])
2106
2107 # Some itags are not included in DASH manifest thus corresponding formats will
2108 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2109 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2110 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2111 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2112
2113 if width is None:
2114 width = int_or_none(fmt.get('width'))
2115 if height is None:
2116 height = int_or_none(fmt.get('height'))
2117
2118 filesize = int_or_none(url_data.get(
2119 'clen', [None])[0]) or _extract_filesize(url)
2120
2121 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2122 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2123
2124 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2125 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2126 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2127
2128 more_fields = {
2129 'filesize': filesize,
2130 'tbr': tbr,
2131 'width': width,
2132 'height': height,
2133 'fps': fps,
2134 'format_note': quality_label or quality,
2135 }
2136 for key, value in more_fields.items():
2137 if value:
2138 dct[key] = value
2139 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2140 if type_:
2141 type_split = type_.split(';')
2142 kind_ext = type_split[0].split('/')
2143 if len(kind_ext) == 2:
2144 kind, _ = kind_ext
2145 dct['ext'] = mimetype2ext(type_split[0])
2146 if kind in ('audio', 'video'):
2147 codecs = None
2148 for mobj in re.finditer(
2149 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2150 if mobj.group('key') == 'codecs':
2151 codecs = mobj.group('val')
2152 break
2153 if codecs:
2154 dct.update(parse_codecs(codecs))
2155 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2156 dct['downloader_options'] = {
2157 # Youtube throttles chunks >~10M
2158 'http_chunk_size': 10485760,
2159 }
2160 formats.append(dct)
2161 else:
2162 manifest_url = (
2163 url_or_none(try_get(
2164 player_response,
2165 lambda x: x['streamingData']['hlsManifestUrl'],
2166 compat_str))
2167 or url_or_none(try_get(
2168 video_info, lambda x: x['hlsvp'][0], compat_str)))
2169 if manifest_url:
2170 formats = []
2171 m3u8_formats = self._extract_m3u8_formats(
2172 manifest_url, video_id, 'mp4', fatal=False)
2173 for a_format in m3u8_formats:
2174 itag = self._search_regex(
2175 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2176 if itag:
2177 a_format['format_id'] = itag
2178 if itag in self._formats:
2179 dct = self._formats[itag].copy()
2180 dct.update(a_format)
2181 a_format = dct
2182 a_format['player_url'] = player_url
2183 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2184 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2185 if self._downloader.params.get('youtube_include_hls_manifest', True):
2186 formats.append(a_format)
2187 else:
2188 error_message = extract_unavailable_message()
2189 if not error_message:
2190 error_message = clean_html(try_get(
2191 player_response, lambda x: x['playabilityStatus']['reason'],
2192 compat_str))
2193 if not error_message:
2194 error_message = clean_html(
2195 try_get(video_info, lambda x: x['reason'][0], compat_str))
2196 if error_message:
2197 raise ExtractorError(error_message, expected=True)
2198 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2199
2200 # uploader
2201 video_uploader = try_get(
2202 video_info, lambda x: x['author'][0],
2203 compat_str) or str_or_none(video_details.get('author'))
2204 if video_uploader:
2205 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2206 else:
2207 self._downloader.report_warning('unable to extract uploader name')
2208
2209 # uploader_id
2210 video_uploader_id = None
2211 video_uploader_url = None
2212 mobj = re.search(
2213 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2214 video_webpage)
2215 if mobj is not None:
2216 video_uploader_id = mobj.group('uploader_id')
2217 video_uploader_url = mobj.group('uploader_url')
2218 else:
2219 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2220 if owner_profile_url:
2221 video_uploader_id = self._search_regex(
2222 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2223 default=None)
2224 video_uploader_url = owner_profile_url
2225
2226 channel_id = (
2227 str_or_none(video_details.get('channelId'))
2228 or self._html_search_meta(
2229 'channelId', video_webpage, 'channel id', default=None)
2230 or self._search_regex(
2231 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2232 video_webpage, 'channel id', default=None, group='id'))
2233 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2234
2235 thumbnails = []
2236 thumbnails_list = try_get(
2237 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2238 for t in thumbnails_list:
2239 if not isinstance(t, dict):
2240 continue
2241 thumbnail_url = url_or_none(t.get('url'))
2242 if not thumbnail_url:
2243 continue
2244 thumbnails.append({
2245 'url': thumbnail_url,
2246 'width': int_or_none(t.get('width')),
2247 'height': int_or_none(t.get('height')),
2248 })
2249
2250 if not thumbnails:
2251 video_thumbnail = None
2252 # We try first to get a high quality image:
2253 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2254 video_webpage, re.DOTALL)
2255 if m_thumb is not None:
2256 video_thumbnail = m_thumb.group(1)
2257 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2258 if thumbnail_url:
2259 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2260 if video_thumbnail:
2261 thumbnails.append({'url': video_thumbnail})
2262
2263 # upload date
2264 upload_date = self._html_search_meta(
2265 'datePublished', video_webpage, 'upload date', default=None)
2266 if not upload_date:
2267 upload_date = self._search_regex(
2268 [r'(?s)id="eow-date.*?>(.*?)</span>',
2269 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2270 video_webpage, 'upload date', default=None)
2271 if not upload_date:
2272 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2273 upload_date = unified_strdate(upload_date)
2274
2275 video_license = self._html_search_regex(
2276 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2277 video_webpage, 'license', default=None)
2278
2279 m_music = re.search(
2280 r'''(?x)
2281 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2282 <ul[^>]*>\s*
2283 <li>(?P<title>.+?)
2284 by (?P<creator>.+?)
2285 (?:
2286 \(.+?\)|
2287 <a[^>]*
2288 (?:
2289 \bhref=["\']/red[^>]*>| # drop possible
2290 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2291 )
2292 .*?
2293 )?</li
2294 ''',
2295 video_webpage)
2296 if m_music:
2297 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2298 video_creator = clean_html(m_music.group('creator'))
2299 else:
2300 video_alt_title = video_creator = None
2301
2302 def extract_meta(field):
2303 return self._html_search_regex(
2304 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2305 video_webpage, field, default=None)
2306
2307 track = extract_meta('Song')
2308 artist = extract_meta('Artist')
2309 album = extract_meta('Album')
2310
2311 # Youtube Music Auto-generated description
2312 release_date = release_year = None
2313 if video_description:
2314 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2315 if mobj:
2316 if not track:
2317 track = mobj.group('track').strip()
2318 if not artist:
2319 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2320 if not album:
2321 album = mobj.group('album'.strip())
2322 release_year = mobj.group('release_year')
2323 release_date = mobj.group('release_date')
2324 if release_date:
2325 release_date = release_date.replace('-', '')
2326 if not release_year:
2327 release_year = int(release_date[:4])
2328 if release_year:
2329 release_year = int(release_year)
2330
2331 m_episode = re.search(
2332 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2333 video_webpage)
2334 if m_episode:
2335 series = unescapeHTML(m_episode.group('series'))
2336 season_number = int(m_episode.group('season'))
2337 episode_number = int(m_episode.group('episode'))
2338 else:
2339 series = season_number = episode_number = None
2340
2341 m_cat_container = self._search_regex(
2342 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2343 video_webpage, 'categories', default=None)
2344 category = None
2345 if m_cat_container:
2346 category = self._html_search_regex(
2347 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2348 default=None)
2349 if not category:
2350 category = try_get(
2351 microformat, lambda x: x['category'], compat_str)
2352 video_categories = None if category is None else [category]
2353
2354 video_tags = [
2355 unescapeHTML(m.group('content'))
2356 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2357 if not video_tags:
2358 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2359
2360 def _extract_count(count_name):
2361 return str_to_int(self._search_regex(
2362 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2363 % re.escape(count_name),
2364 video_webpage, count_name, default=None))
2365
2366 like_count = _extract_count('like')
2367 dislike_count = _extract_count('dislike')
2368
2369 if view_count is None:
2370 view_count = str_to_int(self._search_regex(
2371 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2372 'view count', default=None))
2373
2374 average_rating = (
2375 float_or_none(video_details.get('averageRating'))
2376 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2377
2378 # subtitles
2379 video_subtitles = self.extract_subtitles(
2380 video_id, video_webpage, has_live_chat_replay)
2381 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2382
2383 video_duration = try_get(
2384 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2385 if not video_duration:
2386 video_duration = int_or_none(video_details.get('lengthSeconds'))
2387 if not video_duration:
2388 video_duration = parse_duration(self._html_search_meta(
2389 'duration', video_webpage, 'video duration'))
2390
2391 # Get Subscriber Count of channel
2392 subscriber_count = parse_count(self._search_regex(
2393 r'"text":"([\d\.]+\w?) subscribers"',
2394 video_webpage,
2395 'subscriber count',
2396 default=None
2397 ))
2398
2399 # annotations
2400 video_annotations = None
2401 if self._downloader.params.get('writeannotations', False):
2402 xsrf_token = self._search_regex(
2403 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2404 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2405 invideo_url = try_get(
2406 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2407 if xsrf_token and invideo_url:
2408 xsrf_field_name = self._search_regex(
2409 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2410 video_webpage, 'xsrf field name',
2411 group='xsrf_field_name', default='session_token')
2412 video_annotations = self._download_webpage(
2413 self._proto_relative_url(invideo_url),
2414 video_id, note='Downloading annotations',
2415 errnote='Unable to download video annotations', fatal=False,
2416 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2417
2418 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2419
2420 # Look for the DASH manifest
2421 if self._downloader.params.get('youtube_include_dash_manifest', True):
2422 dash_mpd_fatal = True
2423 for mpd_url in dash_mpds:
2424 dash_formats = {}
2425 try:
2426 def decrypt_sig(mobj):
2427 s = mobj.group(1)
2428 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2429 return '/signature/%s' % dec_s
2430
2431 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2432
2433 for df in self._extract_mpd_formats(
2434 mpd_url, video_id, fatal=dash_mpd_fatal,
2435 formats_dict=self._formats):
2436 if not df.get('filesize'):
2437 df['filesize'] = _extract_filesize(df['url'])
2438 # Do not overwrite DASH format found in some previous DASH manifest
2439 if df['format_id'] not in dash_formats:
2440 dash_formats[df['format_id']] = df
2441 # Additional DASH manifests may end up in HTTP Error 403 therefore
2442 # allow them to fail without bug report message if we already have
2443 # some DASH manifest succeeded. This is temporary workaround to reduce
2444 # burst of bug reports until we figure out the reason and whether it
2445 # can be fixed at all.
2446 dash_mpd_fatal = False
2447 except (ExtractorError, KeyError) as e:
2448 self.report_warning(
2449 'Skipping DASH manifest: %r' % e, video_id)
2450 if dash_formats:
2451 # Remove the formats we found through non-DASH, they
2452 # contain less info and it can be wrong, because we use
2453 # fixed values (for example the resolution). See
2454 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2455 # example.
2456 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2457 formats.extend(dash_formats.values())
2458
2459 # Check for malformed aspect ratio
2460 stretched_m = re.search(
2461 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2462 video_webpage)
2463 if stretched_m:
2464 w = float(stretched_m.group('w'))
2465 h = float(stretched_m.group('h'))
2466 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2467 # We will only process correct ratios.
2468 if w > 0 and h > 0:
2469 ratio = w / h
2470 for f in formats:
2471 if f.get('vcodec') != 'none':
2472 f['stretched_ratio'] = ratio
2473
2474 if not formats:
2475 if 'reason' in video_info:
2476 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2477 regions_allowed = self._html_search_meta(
2478 'regionsAllowed', video_webpage, default=None)
2479 countries = regions_allowed.split(',') if regions_allowed else None
2480 self.raise_geo_restricted(
2481 msg=video_info['reason'][0], countries=countries)
2482 reason = video_info['reason'][0]
2483 if 'Invalid parameters' in reason:
2484 unavailable_message = extract_unavailable_message()
2485 if unavailable_message:
2486 reason = unavailable_message
2487 raise ExtractorError(
2488 'YouTube said: %s' % reason,
2489 expected=True, video_id=video_id)
2490 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2491 raise ExtractorError('This video is DRM protected.', expected=True)
2492
2493 self._sort_formats(formats)
2494
2495 self.mark_watched(video_id, video_info, player_response)
2496
2497 return {
2498 'id': video_id,
2499 'uploader': video_uploader,
2500 'uploader_id': video_uploader_id,
2501 'uploader_url': video_uploader_url,
2502 'channel_id': channel_id,
2503 'channel_url': channel_url,
2504 'upload_date': upload_date,
2505 'license': video_license,
2506 'creator': video_creator or artist,
2507 'title': video_title,
2508 'alt_title': video_alt_title or track,
2509 'thumbnails': thumbnails,
2510 'description': video_description,
2511 'categories': video_categories,
2512 'tags': video_tags,
2513 'subtitles': video_subtitles,
2514 'automatic_captions': automatic_captions,
2515 'duration': video_duration,
2516 'age_limit': 18 if age_gate else 0,
2517 'annotations': video_annotations,
2518 'chapters': chapters,
2519 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2520 'view_count': view_count,
2521 'like_count': like_count,
2522 'dislike_count': dislike_count,
2523 'average_rating': average_rating,
2524 'formats': formats,
2525 'is_live': is_live,
2526 'start_time': start_time,
2527 'end_time': end_time,
2528 'series': series,
2529 'season_number': season_number,
2530 'episode_number': episode_number,
2531 'track': track,
2532 'artist': artist,
2533 'album': album,
2534 'release_date': release_date,
2535 'release_year': release_year,
2536 'subscriber_count': subscriber_count,
2537 }
2538
2539
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extract YouTube playlists (including mixes and music-album playlists).

    Handles full playlist URLs, bare playlist IDs, embed/watch URLs that
    carry a ``list=`` parameter, and invidio.us / youtubekids.com mirrors.
    """
    IE_DESC = 'YouTube.com playlists'
    # Verbose (?x) regex: either a recognized host + path carrying a
    # p/a/list query parameter, or a bare playlist ID (group 2).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                                (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                                \? (?:.*?[&;])*? (?:p|a|list)=
                            |    p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Template for matching playlist entries in the webpage; %s is the
    # video-id pattern, optional groups capture index and title.
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Playlists may be private; attempt login first (inherited helper).
        self._login()

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs scraped from *page*.

        Tries the modern data-video-id attribute markup first, then falls
        back to progressively more relaxed href-based regexes; duplicates
        are reconciled inside extract_videos_from_page_impl.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix, which has no dedicated playlist page.

        The mixes are generated from a single video: the id of the playlist
        is just 'RD' + video_id.  Pages are fetched starting from the last
        seen video until no new ids appear.
        """
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # The title lives in a class-named span; try the most specific
        # class first.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and return (has_videos, playlist_result).

        has_videos is False when the URL serves no actual playlist entries,
        which lets _real_extract fall back to single-video extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """If *url* also identifies a single video, honor --no-playlist.

        Returns (video_id, result): result is a url_result when only the
        video should be downloaded, otherwise None; both are None when the
        URL carries no video id.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2901
2902
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract all videos of a YouTube channel given its /channel/<id> URL."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they
        # also match this URL.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Overridden by YoutubeUserIE, which also needs the original URL.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to the mobile-app deep-link meta tags, which
                # embed the channel id in a vnd.youtube:// URL.
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The channel's uploads playlist shares the channel id with the
            # 'UC' prefix swapped for 'UU'.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            # Probe for at least one entry; an empty page may instead carry
            # an alert message (e.g. terminated channel) worth surfacing.
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3002
3003
class YoutubeUserIE(YoutubeChannelIE):
    """Extract a user's videos from /user/, /c/ or bare-name URLs."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL above is very permissive, so give every other
        # Youtube* extractor defined in this module the first shot;
        # only claim the URL if none of them matches.
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # The path kind ('user' or 'c') must be preserved in the template;
        # bare-name URLs (no kind captured) default to 'user'.
        mobj = re.match(self._VALID_URL, url)
        path_kind = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, mobj.group('id'))
3061
3062
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a channel's /live page to its current live video (if any)."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            # Page unavailable: hand the base channel URL back to the
            # generic resolution machinery.
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        # No live video right now; fall back to the channel itself.
        return self.url_result(base_url)
3113
3114
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """List all playlists of a user/channel; entries come from the base class."""
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3147
3148
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for search extractors: entry regex for result pages."""
    # Matches watch links in search-result markup; title group is optional.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3151
3152
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Search via the innertube JSON API ('ytsearchN:query')."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra 'params' value for the API request; set by subclasses to alter
    # result ordering (see YoutubeSearchDateIE).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent results for *query*, paging via
        the API's continuation tokens."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page nests results differently from continuation pages;
            # probe both layouts.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Non-video items (channels, shelves, ads) are skipped.
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                # Stop exactly at the requested number of results.
                if total == n:
                    return
            # The continuation token for the next page rides in the second
            # top-level item; absence means we reached the last page.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3241
3242
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but orders results by upload date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded protobuf search filter; presumably selects
    # sort-by-upload-date on the API side -- TODO confirm.
    _SEARCH_PARAMS = 'CAI%3D'
3248
3249
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract results from a YouTube /results search page URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Walk the parsed ytInitialData tree depth-first and collect every
        dict carrying a 'videoId' key (renderer objects)."""
        collected = []

        def _walk(node):
            # Strings are leaves; None means a missing branch.
            if node is None or isinstance(node, str):
                return
            if type(node) is list:
                for child in node:
                    _walk(child)
            if type(node) is dict:
                if "videoId" in node:
                    # A renderer; keep it and do not descend further.
                    collected.append(node)
                    return
                for value in node.values():
                    _walk(value)

        _walk(extracted)
        return collected

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Parse ytInitialData out of *page* and append (id, title) pairs
        into the two accumulator lists, deduplicating by video id."""
        initial_data = self._parse_json(
            self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)

        for renderer in self._find_videos_in_json(initial_data):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                # Already seen: at most backfill a missing title.
                pos = ids_in_page.index(video_id)
                if video_title and not titles_in_page[pos]:
                    titles_in_page[pos] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs found in *page*."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
3323
3324
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract a show by delegating to the playlists machinery on its
    /playlists page."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        show_id = self._match_id(url)
        # Every show exposes its seasons as playlists on this page.
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3342
3343
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # The ytInitialData JSON blob embedded in the feed page.
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # The ytcfg.set({...}) call carrying innertube tokens/headers.
    # The dot is escaped so e.g. 'ytcfgXset(...)' cannot match.
    _YTCFG_DATA = r"ytcfg\.set\(({.*?})\)"

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Walk the parsed JSON tree and collect every dict containing a
        'videoId', plus the first 'nextContinuationData' dict found.

        Returns (videos, continuation); continuation is None when the
        feed has no further pages.
        """
        videos = []
        c = {}

        def _real_find(obj):
            # Strings are leaves; compat_str also covers py2 unicode,
            # which a plain `str` check would miss in this codebase.
            if obj is None or isinstance(obj, compat_str):
                return

            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)

            if isinstance(obj, dict):
                if "videoId" in obj:
                    videos.append(obj)
                    return

                if "nextContinuationData" in obj:
                    c["continuation"] = obj["nextContinuationData"]
                    return

                # Only the values can contain nested renderers.
                for o in obj.values():
                    _real_find(o)

        _real_find(extracted)

        return videos, try_get(c, lambda x: x["continuation"])

    def _entries(self, page):
        """Yield url_result entries for every video in the feed,
        following browse_ajax continuations until exhausted."""
        # Ids already yielded; set membership replaces the previous
        # linear scan over all collected videos (O(n^2) over the feed).
        seen_ids = set()

        # ytcfg may legitimately be absent; without it we cannot build
        # the continuation request headers, so pagination stops.
        yt_conf = self._parse_json(self._search_regex(
            self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)

        search_response = self._parse_json(
            self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_info = []

            for v in video_info:
                v_id = try_get(v, lambda x: x['videoId'])
                if not v_id:
                    continue

                if v_id not in seen_ids:
                    seen_ids.add(v_id)
                    new_info.append(v)

            # No unseen videos on this page means the feed is looping;
            # stop rather than request the same continuation forever.
            if not new_info:
                break

            for video in new_info:
                yield self.url_result(
                    try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(),
                    video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText']))

            if not continuation or not yt_conf:
                break

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query={
                    # Old and new parameter names for the same token.
                    "ctoken": try_get(continuation, lambda x: x["continuation"]),
                    "continuation": try_get(continuation, lambda x: x["continuation"]),
                    "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
                },
                headers={
                    # Innertube client identification pulled from ytcfg;
                    # missing keys simply become None and are sent empty.
                    "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
                    "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
                    "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
                    "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
                    "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
                    "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
                    "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
                })

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3450
3451
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's Watch Later list (id 'WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL carrying list=WL may really mean "just this video".
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        return self._extract_playlist('WL')[1]
3471
3472
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites list."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page is backed by a regular playlist: scrape its
        # id out of the markup and delegate to the playlist extractor.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_playlist, 'YoutubePlaylist')
3483
3484
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for youtube.com/feed/recommended (alias :ytrec)."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'  # feed path segment; also yields IE_NAME 'youtube:recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3490
3491
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for youtube.com/feed/subscriptions (alias :ytsubs)."""
    # Description reworded to the ':ytsubs" for short' pattern used by the
    # sibling feed extractors; ':ytsubs' is the actual pseudo-URL alias
    # accepted by _VALID_URL, not a bare keyword.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'  # feed path segment; also yields IE_NAME 'youtube:subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3497
3498
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for youtube.com/feed/history (alias :ythistory)."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'  # feed path segment; also yields IE_NAME 'youtube:history'
    _PLAYLIST_TITLE = 'Youtube History'
3504
3505
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose video id was lost (typically an unquoted
    '&' eaten by the shell) and raises a helpful error instead of letting
    a generic extractor fail cryptically."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose regex: a watch URL with only non-id parameters (or an
    # attribution_link with only its 'a' parameter) and nothing after.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: matching this pattern means the URL is unusable.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3553
3554
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catches watch URLs whose video id is shorter than the canonical
    11 characters and reports the truncation explicitly."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Matching at all means the id is 1-10 chars, i.e. truncated.
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (
            truncated_id, url)
        raise ExtractorError(message, expected=True)