]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/youtube.py
Merge pull request #68 from peet1993/master
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_count,
43 parse_duration,
44 remove_quotes,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 uppercase_escape,
54 url_or_none,
55 urlencode_postdata,
56 )
57
58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist ids: regular (PL), liked (LL), uploads (UU), mixes (RD),
    # auto-generated albums (OLAK5uy_), etc.
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers identifying the (old) desktop web client to the innertube backend
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Force the English UI (hl=en) via the PREF cookie so that pages are
        # served in a language the scraping regexes expect.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap each plain video id into a url_result entry handled by the
        # 'Youtube' extractor.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs of the login page are echoed back with every
        # subsequent request.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST a JSON-encoded "f.req" payload (merged with the hidden login
            # form fields) to an accounts.google.com endpoint and return the
            # parsed JSON response, or False on failure (fatal=False).
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Strip the anti-XSSI junk preceding the first '[' so the body
                # parses as JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # NOTE(review): the positional arrays below mirror Google's internal
        # (reverse-engineered) sign-in protocol; the meaning of most slots is
        # unknown, so they must be kept byte-for-byte.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token required by the password challenge request
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry at [0][5] signals a login error
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # If present, the account requires an additional challenge (2FA etc.)
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # Transaction token needed to build the TFA submission URL
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes are sometimes entered with the "G-" SMS prefix
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Non-TFA challenges cannot be solved automatically; tell the
                # user to resolve them in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Visiting the CheckCookie URL finalizes the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Force the legacy (non-polymer) page layout, which the HTML-scraping
        # helpers in these extractors expect.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # No downloader means nothing to configure or log in with
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
288
289
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following "Load more" continuations."""
        widget_html = html_chunk = page
        for page_num in itertools.count(1):
            for entry in self._process_page(html_chunk):
                yield entry

            # The widget HTML carries the href of the next continuation, if any
            match = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if match is None:
                break

            max_retries = 3
            attempt = 0
            while attempt <= max_retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    payload = self._download_json(
                        'https://www.youtube.com/%s' % match.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                    break
                except ExtractorError as err:
                    is_transient = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code in (500, 503))
                    if is_transient:
                        attempt += 1
                        if attempt <= max_retries:
                            continue
                    raise

            html_chunk = payload['content_html']
            if not html_chunk.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = payload['load_more_widget_html']
328
329
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Turn every (id, title) pair scraped from the page into a url result
        # handled by the 'Youtube' extractor.
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, appending new ids/titles in place."""
        for match in re.finditer(video_re, page):
            groups = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in groups and match.group('id') == '0':
                continue
            video_id = match.group('id')
            video_title = None
            if 'title' in groups:
                video_title = unescapeHTML(match.group('title'))
            if video_title:
                video_title = video_title.strip()
            # The "Play all" pseudo-link is not a real title
            if video_title == '► Play all':
                video_title = None
            if video_id in ids_in_page:
                # Duplicate id: keep the first non-empty title encountered
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs scraped from *page* via _VIDEO_RE."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
361
362
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Collect playlist ids from "lockup title" links, de-duplicated while
        # preserving first-seen order.
        link_re = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        for playlist_id in orderedSet(re.findall(link_re, content)):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Return a playlist result of every playlist listed at *url*."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
376
377
378 class YoutubeIE(YoutubeBaseInfoExtractor):
379 IE_DESC = 'YouTube.com'
380 _VALID_URL = r"""(?x)^
381 (
382 (?:https?://|//) # http(s):// or protocol-independent URL
383 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
384 (?:www\.)?deturl\.com/www\.youtube\.com/|
385 (?:www\.)?pwnyoutube\.com/|
386 (?:www\.)?hooktube\.com/|
387 (?:www\.)?yourepeat\.com/|
388 tube\.majestyc\.net/|
389 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
390 (?:(?:www|dev)\.)?invidio\.us/|
391 (?:(?:www|no)\.)?invidiou\.sh/|
392 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
393 (?:www\.)?invidious\.kabi\.tk/|
394 (?:www\.)?invidious\.13ad\.de/|
395 (?:www\.)?invidious\.mastodon\.host/|
396 (?:www\.)?invidious\.nixnet\.xyz/|
397 (?:www\.)?invidious\.drycat\.fr/|
398 (?:www\.)?tube\.poal\.co/|
399 (?:www\.)?vid\.wxzm\.sx/|
400 (?:www\.)?yewtu\.be/|
401 (?:www\.)?yt\.elukerio\.org/|
402 (?:www\.)?yt\.lelux\.fi/|
403 (?:www\.)?invidious\.ggc-project\.de/|
404 (?:www\.)?yt\.maisputain\.ovh/|
405 (?:www\.)?invidious\.13ad\.de/|
406 (?:www\.)?invidious\.toot\.koeln/|
407 (?:www\.)?invidious\.fdn\.fr/|
408 (?:www\.)?watch\.nettohikari\.com/|
409 (?:www\.)?kgg2m7yk5aybusll\.onion/|
410 (?:www\.)?qklhadlycap4cnod\.onion/|
411 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
412 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
413 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
414 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
415 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
416 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
417 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
418 (?:.*?\#/)? # handle anchor (#/) redirect urls
419 (?: # the various things that can precede the ID:
420 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
421 |(?: # or the v= param in all its forms
422 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
423 (?:\?|\#!?) # the params delimiter ? or # or #!
424 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
425 v=
426 )
427 ))
428 |(?:
429 youtu\.be| # just youtu.be/xxxx
430 vid\.plus| # or vid.plus/xxxx
431 zwearz\.com/watch| # or zwearz.com/watch/xxxx
432 )/
433 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
434 )
435 )? # all until now is optional -> you can pass the naked ID
436 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
437 (?!.*?\blist=
438 (?:
439 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
440 WL # WL are handled by the watch later IE
441 )
442 )
443 (?(1).+)? # if we found the ID, everything can follow
444 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
445 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
446 _PLAYER_INFO_RE = (
447 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
448 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
449 )
450 _formats = {
451 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
452 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
453 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
454 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
455 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
456 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
457 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
458 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
459 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
460 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
461 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
462 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
463 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
464 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
465 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
466 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
467 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
468 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
469
470
471 # 3D videos
472 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
473 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
474 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
475 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
476 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
477 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
478 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
479
480 # Apple HTTP Live Streaming
481 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
482 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
483 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
484 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
485 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
486 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
487 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
488 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
489
490 # DASH mp4 video
491 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
497 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
499 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
500 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
501 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
502 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
503
504 # Dash mp4 audio
505 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
506 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
507 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
508 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
509 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
510 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
511 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
512
513 # Dash webm
514 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
519 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
520 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
521 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
529 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
530 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
536
537 # Dash webm audio
538 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
539 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
540
541 # Dash webm audio with opus inside
542 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
543 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
544 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
545
546 # RTMP (unnamed)
547 '_rtmp': {'protocol': 'rtmp'},
548
549 # av01 video only formats sometimes served with "unknown" codecs
550 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
553 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
554 }
555 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
556
557 _GEO_BYPASS = False
558
559 IE_NAME = 'youtube'
560 _TESTS = [
561 {
562 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
563 'info_dict': {
564 'id': 'BaW_jenozKc',
565 'ext': 'mp4',
566 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
567 'uploader': 'Philipp Hagemeister',
568 'uploader_id': 'phihag',
569 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
570 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
571 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
572 'upload_date': '20121002',
573 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
574 'categories': ['Science & Technology'],
575 'tags': ['youtube-dl'],
576 'duration': 10,
577 'view_count': int,
578 'like_count': int,
579 'dislike_count': int,
580 'start_time': 1,
581 'end_time': 9,
582 }
583 },
584 {
585 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
586 'note': 'Embed-only video (#1746)',
587 'info_dict': {
588 'id': 'yZIXLfi8CZQ',
589 'ext': 'mp4',
590 'upload_date': '20120608',
591 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
592 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
593 'uploader': 'SET India',
594 'uploader_id': 'setindia',
595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
596 'age_limit': 18,
597 }
598 },
599 {
600 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
601 'note': 'Use the first video ID in the URL',
602 'info_dict': {
603 'id': 'BaW_jenozKc',
604 'ext': 'mp4',
605 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
606 'uploader': 'Philipp Hagemeister',
607 'uploader_id': 'phihag',
608 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
609 'upload_date': '20121002',
610 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
611 'categories': ['Science & Technology'],
612 'tags': ['youtube-dl'],
613 'duration': 10,
614 'view_count': int,
615 'like_count': int,
616 'dislike_count': int,
617 },
618 'params': {
619 'skip_download': True,
620 },
621 },
622 {
623 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
624 'note': '256k DASH audio (format 141) via DASH manifest',
625 'info_dict': {
626 'id': 'a9LDPn-MO4I',
627 'ext': 'm4a',
628 'upload_date': '20121002',
629 'uploader_id': '8KVIDEO',
630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
631 'description': '',
632 'uploader': '8KVIDEO',
633 'title': 'UHDTV TEST 8K VIDEO.mp4'
634 },
635 'params': {
636 'youtube_include_dash_manifest': True,
637 'format': '141',
638 },
639 'skip': 'format 141 not served anymore',
640 },
641 # Controversy video
642 {
643 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
644 'info_dict': {
645 'id': 'T4XJQO3qol8',
646 'ext': 'mp4',
647 'duration': 219,
648 'upload_date': '20100909',
649 'uploader': 'Amazing Atheist',
650 'uploader_id': 'TheAmazingAtheist',
651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
652 'title': 'Burning Everyone\'s Koran',
653 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
654 }
655 },
656 # Normal age-gate video (embed allowed)
657 {
658 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
659 'info_dict': {
660 'id': 'HtVdAasjOgU',
661 'ext': 'mp4',
662 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
663 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
664 'duration': 142,
665 'uploader': 'The Witcher',
666 'uploader_id': 'WitcherGame',
667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
668 'upload_date': '20140605',
669 'age_limit': 18,
670 },
671 },
672 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
673 {
674 'url': 'lqQg6PlCWgI',
675 'info_dict': {
676 'id': 'lqQg6PlCWgI',
677 'ext': 'mp4',
678 'duration': 6085,
679 'upload_date': '20150827',
680 'uploader_id': 'olympic',
681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
682 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
683 'uploader': 'Olympic',
684 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
685 },
686 'params': {
687 'skip_download': 'requires avconv',
688 }
689 },
690 # Non-square pixels
691 {
692 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
693 'info_dict': {
694 'id': '_b-2C3KPAM0',
695 'ext': 'mp4',
696 'stretched_ratio': 16 / 9.,
697 'duration': 85,
698 'upload_date': '20110310',
699 'uploader_id': 'AllenMeow',
700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
701 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
702 'uploader': '孫ᄋᄅ',
703 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
704 },
705 },
706 # url_encoded_fmt_stream_map is empty string
707 {
708 'url': 'qEJwOuvDf7I',
709 'info_dict': {
710 'id': 'qEJwOuvDf7I',
711 'ext': 'webm',
712 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
713 'description': '',
714 'upload_date': '20150404',
715 'uploader_id': 'spbelect',
716 'uploader': 'Наблюдатели Петербурга',
717 },
718 'params': {
719 'skip_download': 'requires avconv',
720 },
721 'skip': 'This live event has ended.',
722 },
723 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
724 {
725 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
726 'info_dict': {
727 'id': 'FIl7x6_3R5Y',
728 'ext': 'webm',
729 'title': 'md5:7b81415841e02ecd4313668cde88737a',
730 'description': 'md5:116377fd2963b81ec4ce64b542173306',
731 'duration': 220,
732 'upload_date': '20150625',
733 'uploader_id': 'dorappi2000',
734 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
735 'uploader': 'dorappi2000',
736 'formats': 'mincount:31',
737 },
738 'skip': 'not actual anymore',
739 },
740 # DASH manifest with segment_list
741 {
742 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
743 'md5': '8ce563a1d667b599d21064e982ab9e31',
744 'info_dict': {
745 'id': 'CsmdDsKjzN8',
746 'ext': 'mp4',
747 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
748 'uploader': 'Airtek',
749 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
750 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
751 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
752 },
753 'params': {
754 'youtube_include_dash_manifest': True,
755 'format': '135', # bestvideo
756 },
757 'skip': 'This live event has ended.',
758 },
759 {
760 # Multifeed videos (multiple cameras), URL is for Main Camera
761 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
762 'info_dict': {
763 'id': 'jqWvoWXjCVs',
764 'title': 'teamPGP: Rocket League Noob Stream',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 },
767 'playlist': [{
768 'info_dict': {
769 'id': 'jqWvoWXjCVs',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
773 'duration': 7335,
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
778 'license': 'Standard YouTube License',
779 },
780 }, {
781 'info_dict': {
782 'id': '6h8e8xoXJzg',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
786 'duration': 7337,
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
791 'license': 'Standard YouTube License',
792 },
793 }, {
794 'info_dict': {
795 'id': 'PUOgX5z9xZw',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7337,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }, {
807 'info_dict': {
808 'id': 'teuwxikvS5k',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (zim)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
812 'duration': 7334,
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
817 'license': 'Standard YouTube License',
818 },
819 }],
820 'params': {
821 'skip_download': True,
822 },
823 'skip': 'This video is not available.',
824 },
825 {
826 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
827 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
828 'info_dict': {
829 'id': 'gVfLd0zydlo',
830 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
831 },
832 'playlist_count': 2,
833 'skip': 'Not multifeed anymore',
834 },
835 {
836 'url': 'https://vid.plus/FlRa-iH7PGw',
837 'only_matching': True,
838 },
839 {
840 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
841 'only_matching': True,
842 },
843 {
844 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
845 # Also tests cut-off URL expansion in video description (see
846 # https://github.com/ytdl-org/youtube-dl/issues/1892,
847 # https://github.com/ytdl-org/youtube-dl/issues/8164)
848 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
849 'info_dict': {
850 'id': 'lsguqyKfVQg',
851 'ext': 'mp4',
852 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
853 'alt_title': 'Dark Walk - Position Music',
854 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
855 'duration': 133,
856 'upload_date': '20151119',
857 'uploader_id': 'IronSoulElf',
858 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
859 'uploader': 'IronSoulElf',
860 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
861 'track': 'Dark Walk - Position Music',
862 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
863 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
864 },
865 'params': {
866 'skip_download': True,
867 },
868 },
869 {
870 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
871 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
872 'only_matching': True,
873 },
874 {
875 # Video with yt:stretch=17:0
876 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
877 'info_dict': {
878 'id': 'Q39EVAstoRM',
879 'ext': 'mp4',
880 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
881 'description': 'md5:ee18a25c350637c8faff806845bddee9',
882 'upload_date': '20151107',
883 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
884 'uploader': 'CH GAMER DROID',
885 },
886 'params': {
887 'skip_download': True,
888 },
889 'skip': 'This video does not exist.',
890 },
891 {
892 # Video licensed under Creative Commons
893 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
894 'info_dict': {
895 'id': 'M4gD1WSo5mA',
896 'ext': 'mp4',
897 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
898 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
899 'duration': 721,
900 'upload_date': '20150127',
901 'uploader_id': 'BerkmanCenter',
902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
903 'uploader': 'The Berkman Klein Center for Internet & Society',
904 'license': 'Creative Commons Attribution license (reuse allowed)',
905 },
906 'params': {
907 'skip_download': True,
908 },
909 },
910 {
911 # Channel-like uploader_url
912 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
913 'info_dict': {
914 'id': 'eQcmzGIKrzg',
915 'ext': 'mp4',
916 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
917 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
918 'duration': 4060,
919 'upload_date': '20151119',
920 'uploader': 'Bernie Sanders',
921 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
922 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
923 'license': 'Creative Commons Attribution license (reuse allowed)',
924 },
925 'params': {
926 'skip_download': True,
927 },
928 },
929 {
930 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
931 'only_matching': True,
932 },
933 {
934 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
935 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
936 'only_matching': True,
937 },
938 {
939 # Rental video preview
940 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
941 'info_dict': {
942 'id': 'uGpuVWrhIzE',
943 'ext': 'mp4',
944 'title': 'Piku - Trailer',
945 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
946 'upload_date': '20150811',
947 'uploader': 'FlixMatrix',
948 'uploader_id': 'FlixMatrixKaravan',
949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
950 'license': 'Standard YouTube License',
951 },
952 'params': {
953 'skip_download': True,
954 },
955 'skip': 'This video is not available.',
956 },
957 {
958 # YouTube Red video with episode data
959 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
960 'info_dict': {
961 'id': 'iqKdEhx-dD4',
962 'ext': 'mp4',
963 'title': 'Isolation - Mind Field (Ep 1)',
964 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
965 'duration': 2085,
966 'upload_date': '20170118',
967 'uploader': 'Vsauce',
968 'uploader_id': 'Vsauce',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
970 'series': 'Mind Field',
971 'season_number': 1,
972 'episode_number': 1,
973 },
974 'params': {
975 'skip_download': True,
976 },
977 'expected_warnings': [
978 'Skipping DASH manifest',
979 ],
980 },
981 {
982 # The following content has been identified by the YouTube community
983 # as inappropriate or offensive to some audiences.
984 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
985 'info_dict': {
986 'id': '6SJNVb0GnPI',
987 'ext': 'mp4',
988 'title': 'Race Differences in Intelligence',
989 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
990 'duration': 965,
991 'upload_date': '20140124',
992 'uploader': 'New Century Foundation',
993 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
994 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
995 },
996 'params': {
997 'skip_download': True,
998 },
999 },
1000 {
1001 # itag 212
1002 'url': '1t24XAntNCY',
1003 'only_matching': True,
1004 },
1005 {
1006 # geo restricted to JP
1007 'url': 'sJL6WA-aGkQ',
1008 'only_matching': True,
1009 },
1010 {
1011 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1012 'only_matching': True,
1013 },
1014 {
1015 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1016 'only_matching': True,
1017 },
1018 {
1019 # DRM protected
1020 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1021 'only_matching': True,
1022 },
1023 {
1024 # Video with unsupported adaptive stream type formats
1025 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1026 'info_dict': {
1027 'id': 'Z4Vy8R84T1U',
1028 'ext': 'mp4',
1029 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1030 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1031 'duration': 433,
1032 'upload_date': '20130923',
1033 'uploader': 'Amelia Putri Harwita',
1034 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1036 'formats': 'maxcount:10',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 'youtube_include_dash_manifest': False,
1041 },
1042 'skip': 'not actual anymore',
1043 },
1044 {
1045 # Youtube Music Auto-generated description
1046 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1047 'info_dict': {
1048 'id': 'MgNrAu2pzNs',
1049 'ext': 'mp4',
1050 'title': 'Voyeur Girl',
1051 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1052 'upload_date': '20190312',
1053 'uploader': 'Stephen - Topic',
1054 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1055 'artist': 'Stephen',
1056 'track': 'Voyeur Girl',
1057 'album': 'it\'s too much love to know my dear',
1058 'release_date': '20190313',
1059 'release_year': 2019,
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
1066 # Youtube Music Auto-generated description
1067 # Retrieve 'artist' field from 'Artist:' in video description
1068 # when it is present on youtube music video
1069 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1070 'info_dict': {
1071 'id': 'k0jLE7tTwjY',
1072 'ext': 'mp4',
1073 'title': 'Latch Feat. Sam Smith',
1074 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1075 'upload_date': '20150110',
1076 'uploader': 'Various Artists - Topic',
1077 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1078 'artist': 'Disclosure',
1079 'track': 'Latch Feat. Sam Smith',
1080 'album': 'Latch Featuring Sam Smith',
1081 'release_date': '20121008',
1082 'release_year': 2012,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
1089 # Youtube Music Auto-generated description
1090 # handle multiple artists on youtube music video
1091 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1092 'info_dict': {
1093 'id': '74qn0eJSjpA',
1094 'ext': 'mp4',
1095 'title': 'Eastside',
1096 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1097 'upload_date': '20180710',
1098 'uploader': 'Benny Blanco - Topic',
1099 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1100 'artist': 'benny blanco, Halsey, Khalid',
1101 'track': 'Eastside',
1102 'album': 'Eastside',
1103 'release_date': '20180713',
1104 'release_year': 2018,
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 {
1111 # Youtube Music Auto-generated description
1112 # handle youtube music video with release_year and no release_date
1113 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1114 'info_dict': {
1115 'id': '-hcAI0g-f5M',
1116 'ext': 'mp4',
1117 'title': 'Put It On Me',
1118 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1119 'upload_date': '20180426',
1120 'uploader': 'Matt Maeson - Topic',
1121 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1122 'artist': 'Matt Maeson',
1123 'track': 'Put It On Me',
1124 'album': 'The Hearse',
1125 'release_date': None,
1126 'release_year': 2018,
1127 },
1128 'params': {
1129 'skip_download': True,
1130 },
1131 },
1132 {
1133 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1134 'only_matching': True,
1135 },
1136 {
1137 # invalid -> valid video id redirection
1138 'url': 'DJztXj2GPfl',
1139 'info_dict': {
1140 'id': 'DJztXj2GPfk',
1141 'ext': 'mp4',
1142 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1143 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1144 'upload_date': '20090125',
1145 'uploader': 'Prochorowka',
1146 'uploader_id': 'Prochorowka',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1148 'artist': 'Panjabi MC',
1149 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1150 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1151 },
1152 'params': {
1153 'skip_download': True,
1154 },
1155 },
1156 {
1157 # empty description results in an empty string
1158 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1159 'info_dict': {
1160 'id': 'x41yOUIvK2k',
1161 'ext': 'mp4',
1162 'title': 'IMG 3456',
1163 'description': '',
1164 'upload_date': '20170613',
1165 'uploader_id': 'ElevageOrVert',
1166 'uploader': 'ElevageOrVert',
1167 },
1168 'params': {
1169 'skip_download': True,
1170 },
1171 },
1172 ]
1173
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and the per-instance signature cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature length pattern) -> deciphering function,
        # so each player version is downloaded and parsed at most once.
        self._player_cache = {}
1177
1178 def report_video_info_webpage_download(self, video_id):
1179 """Report attempt to download video info webpage."""
1180 self.to_screen('%s: Downloading video info webpage' % video_id)
1181
1182 def report_information_extraction(self, video_id):
1183 """Report attempt to extract video information."""
1184 self.to_screen('%s: Extracting video information' % video_id)
1185
1186 def report_unavailable_format(self, video_id, format):
1187 """Report extracted video URL."""
1188 self.to_screen('%s: Format %s not available' % (video_id, format))
1189
1190 def report_rtmp_download(self):
1191 """Indicate the download will use the RTMP protocol."""
1192 self.to_screen('RTMP download detected')
1193
1194 def _signature_cache_id(self, example_sig):
1195 """ Return a string representation of a signature """
1196 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1197
1198 @classmethod
1199 def _extract_player_info(cls, player_url):
1200 for player_re in cls._PLAYER_INFO_RE:
1201 id_m = re.search(player_re, player_url)
1202 if id_m:
1203 break
1204 else:
1205 raise ExtractorError('Cannot identify player %r' % player_url)
1206 return id_m.group('ext'), id_m.group('id')
1207
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player and build a signature-deciphering function.

        The derived character permutation is cached on disk keyed by player
        type, player id and the signature's length pattern, so later runs
        skip downloading and parsing the player again.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # the cache key is used as a file name; guard against path components
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # cache_spec is the index permutation; replay it directly
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the permutation by deciphering a probe string of the same
        # length as the example signature; presumably the player function
        # only rearranges characters — NOTE(review): confirm for new players.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1247
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*.

        Used with the youtube_print_sig_code option so the derived
        permutation can be pasted back as a static implementation.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with a constant step as a slice.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, coalescing runs with step +/-1
            # into slices and emitting single lookups otherwise.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # flush the final element or the trailing run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1286
    def _parse_sig_js(self, jscode):
        """Locate the signature-deciphering function in the player JS.

        Returns a callable mapping a scrambled signature string to its
        deciphered form, executed by the pure-Python JS interpreter.
        """
        # Patterns are tried in order; current player layouts first.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # JSInterpreter functions take their arguments as a single list
        return lambda s: initial_function([s])
1307
1308 def _parse_sig_swf(self, file_contents):
1309 swfi = SWFInterpreter(file_contents)
1310 TARGET_CLASSNAME = 'SignatureDecipher'
1311 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1312 initial_function = swfi.extract_function(searched_class, 'decipher')
1313 return lambda s: initial_function([s])
1314
1315 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1316 """Turn the encrypted s field into a working signature"""
1317
1318 if player_url is None:
1319 raise ExtractorError('Cannot decrypt signature without player_url')
1320
1321 if player_url.startswith('//'):
1322 player_url = 'https:' + player_url
1323 elif not re.match(r'https?://', player_url):
1324 player_url = compat_urlparse.urljoin(
1325 'https://www.youtube.com', player_url)
1326 try:
1327 player_id = (player_url, self._signature_cache_id(s))
1328 if player_id not in self._player_cache:
1329 func = self._extract_signature_function(
1330 video_id, player_url, s
1331 )
1332 self._player_cache[player_id] = func
1333 func = self._player_cache[player_id]
1334 if self._downloader.params.get('youtube_print_sig_code'):
1335 self._print_sig_code(func, s)
1336 return func(s)
1337 except Exception as e:
1338 tb = traceback.format_exc()
1339 raise ExtractorError(
1340 'Signature extraction failed: ' + tb, cause=e)
1341
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Return a dict mapping subtitle language codes to format lists.

        Queries the legacy timedtext listing endpoint; additionally exposes
        a synthetic 'live_chat' entry when a live chat replay is available.
        Returns {} (after a warning) on download failure or no subtitles.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)  # note=False: no screen message
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # keep only the first track listed for each language
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            # fetched by the dedicated youtube_live_chat_replay downloader
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1381
1382 def _get_ytplayer_config(self, video_id, webpage):
1383 patterns = (
1384 # User data may contain arbitrary character sequences that may affect
1385 # JSON extraction with regex, e.g. when '};' is contained the second
1386 # regex won't capture the whole JSON. Yet working around by trying more
1387 # concrete regex first keeping in mind proper quoted string handling
1388 # to be implemented in future that will replace this workaround (see
1389 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1390 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1391 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1392 r';ytplayer\.config\s*=\s*({.+?});',
1393 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
1394 )
1395 config = self._search_regex(
1396 patterns, webpage, 'ytplayer.config', default=None)
1397 if config:
1398 return self._parse_json(
1399 uppercase_escape(config), video_id, fatal=False)
1400
1401 def _get_yt_initial_data(self, video_id, webpage):
1402 config = self._search_regex(
1403 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1404 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
1405 webpage, 'ytInitialData', default=None)
1406 if config:
1407 return self._parse_json(
1408 uppercase_escape(config), video_id, fatal=False)
1409
1410 def _get_music_metadata_from_yt_initial(self, yt_initial):
1411 music_metadata = []
1412 key_map = {
1413 'Album': 'album',
1414 'Artist': 'artist',
1415 'Song': 'track'
1416 }
1417 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1418 if type(contents) is list:
1419 for content in contents:
1420 music_track = {}
1421 if type(content) is not dict:
1422 continue
1423 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1424 if type(videoSecondaryInfoRenderer) is not dict:
1425 continue
1426 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1427 if type(rows) is not list:
1428 continue
1429 for row in rows:
1430 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1431 if type(metadataRowRenderer) is not dict:
1432 continue
1433 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1434 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1435 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1436 if type(key) is not str or type(value) is not str:
1437 continue
1438 if key in key_map:
1439 if key_map[key] in music_track:
1440 # we've started on a new track
1441 music_metadata.append(music_track)
1442 music_track = {}
1443 music_track[key_map[key]] = value
1444 if len(music_track.keys()):
1445 music_metadata.append(music_track)
1446 return music_metadata
1447
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language codes to lists of caption format
        dicts, or {} (after a warning) when no automatic captions exist.
        Three player generations are handled, newest last in the code:
        ttsurl listing, player_response captionTracks, and the legacy
        caption_tracks args.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            # Legacy ttsurl-based listing (players before 22.06.2017)
            if "args" in player_config and "ttsurl" in player_config["args"]:
                args = player_config['args']
                caption_url = args['ttsurl']
                timestamp = args['timestamp']

                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the tlang/fmt
                # query parameters of a base caption URL.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if "args" in player_config:
                player_response = player_config["args"].get('player_response')
            else:
                # New player system (ytInitialPlayerResponse) as of October 2020
                player_response = player_config

            if player_response:
                if isinstance(player_response, compat_str):
                    player_response = self._parse_json(
                        player_response, video_id, fatal=False)

                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                caption_tracks = renderer['captionTracks']
                for caption_track in caption_tracks:
                    if 'kind' not in caption_track:
                        # not an automatic transcription
                        continue
                    base_url = caption_track['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    # only the first automatic track is used
                    return make_captions(base_url, sub_lang_list)

                self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                return {}

            if "args" in player_config:
                args = player_config["args"]

                # Some videos don't provide ttsurl but rather caption_tracks and
                # caption_translation_languages (e.g. 20LmZk1hakA)
                # Not used anymore as of 22.06.2017
                caption_tracks = args['caption_tracks']
                caption_translation_languages = args['caption_translation_languages']
                caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
                sub_lang_list = []
                for lang in caption_translation_languages.split(','):
                    lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                    sub_lang = lang_qs.get('lc', [None])[0]
                    if sub_lang:
                        sub_lang_list.append(sub_lang)
                return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1567
1568 def _mark_watched(self, video_id, video_info, player_response):
1569 playback_url = url_or_none(try_get(
1570 player_response,
1571 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1572 video_info, lambda x: x['videostats_playback_base_url'][0]))
1573 if not playback_url:
1574 return
1575 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1576 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1577
1578 # cpn generation algorithm is reverse engineered from base.js.
1579 # In fact it works even with dummy cpn.
1580 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1581 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1582
1583 qs.update({
1584 'ver': ['2'],
1585 'cpn': [cpn],
1586 })
1587 playback_url = compat_urlparse.urlunparse(
1588 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1589
1590 self._download_webpage(
1591 playback_url, video_id, 'Marking watched',
1592 'Unable to mark watched', fatal=False)
1593
    @staticmethod
    def _extract_urls(webpage):
        """Return all YouTube embed URLs/video ids found in a webpage.

        Covers iframe/object/SWF player embeds (full URLs), lazyYT embeds
        and the Wordpress "YouTube Video Importer" plugin (bare video ids).
        """
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        # m[-1] is the last group, i.e. the video id
        entries.extend(m[-1] for m in matches)

        return entries
1625
1626 @staticmethod
1627 def _extract_url(webpage):
1628 urls = YoutubeIE._extract_urls(webpage)
1629 return urls[0] if urls else None
1630
1631 @classmethod
1632 def extract_id(cls, url):
1633 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1634 if mobj is None:
1635 raise ExtractorError('Invalid URL: %s' % url)
1636 video_id = mobj.group(2)
1637 return video_id
1638
1639 def _extract_chapters_from_json(self, webpage, video_id, duration):
1640 if not webpage:
1641 return
1642 initial_data = self._parse_json(
1643 self._search_regex(
1644 r'window\["ytInitialData"\] = (.+);\n', webpage,
1645 'player args', default='{}'),
1646 video_id, fatal=False)
1647 if not initial_data or not isinstance(initial_data, dict):
1648 return
1649 chapters_list = try_get(
1650 initial_data,
1651 lambda x: x['playerOverlays']
1652 ['playerOverlayRenderer']
1653 ['decoratedPlayerBarRenderer']
1654 ['decoratedPlayerBarRenderer']
1655 ['playerBar']
1656 ['chapteredPlayerBarRenderer']
1657 ['chapters'],
1658 list)
1659 if not chapters_list:
1660 return
1661
1662 def chapter_time(chapter):
1663 return float_or_none(
1664 try_get(
1665 chapter,
1666 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1667 int),
1668 scale=1000)
1669 chapters = []
1670 for next_num, chapter in enumerate(chapters_list, start=1):
1671 start_time = chapter_time(chapter)
1672 if start_time is None:
1673 continue
1674 end_time = (chapter_time(chapters_list[next_num])
1675 if next_num < len(chapters_list) else duration)
1676 if end_time is None:
1677 continue
1678 title = try_get(
1679 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1680 compat_str)
1681 chapters.append({
1682 'start_time': start_time,
1683 'end_time': end_time,
1684 'title': title,
1685 })
1686 return chapters
1687
1688 @staticmethod
1689 def _extract_chapters_from_description(description, duration):
1690 if not description:
1691 return None
1692 chapter_lines = re.findall(
1693 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1694 description)
1695 if not chapter_lines:
1696 return None
1697 chapters = []
1698 for next_num, (chapter_line, time_point) in enumerate(
1699 chapter_lines, start=1):
1700 start_time = parse_duration(time_point)
1701 if start_time is None:
1702 continue
1703 if start_time > duration:
1704 break
1705 end_time = (duration if next_num == len(chapter_lines)
1706 else parse_duration(chapter_lines[next_num][1]))
1707 if end_time is None:
1708 continue
1709 if end_time > duration:
1710 end_time = duration
1711 if start_time > end_time:
1712 break
1713 chapter_title = re.sub(
1714 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1715 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1716 chapters.append({
1717 'start_time': start_time,
1718 'end_time': end_time,
1719 'title': chapter_title,
1720 })
1721 return chapters
1722
1723 def _extract_chapters(self, webpage, description, video_id, duration):
1724 return (self._extract_chapters_from_json(webpage, video_id, duration)
1725 or self._extract_chapters_from_description(description, duration))
1726
1727 def _real_extract(self, url):
1728 url, smuggled_data = unsmuggle_url(url, {})
1729
1730 proto = (
1731 'http' if self._downloader.params.get('prefer_insecure', False)
1732 else 'https')
1733
1734 start_time = None
1735 end_time = None
1736 parsed_url = compat_urllib_parse_urlparse(url)
1737 for component in [parsed_url.fragment, parsed_url.query]:
1738 query = compat_parse_qs(component)
1739 if start_time is None and 't' in query:
1740 start_time = parse_duration(query['t'][0])
1741 if start_time is None and 'start' in query:
1742 start_time = parse_duration(query['start'][0])
1743 if end_time is None and 'end' in query:
1744 end_time = parse_duration(query['end'][0])
1745
1746 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1747 mobj = re.search(self._NEXT_URL_RE, url)
1748 if mobj:
1749 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1750 video_id = self.extract_id(url)
1751
1752 # Get video webpage
1753 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1754 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1755
1756 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1757 video_id = qs.get('v', [None])[0] or video_id
1758
1759 # Attempt to extract SWF player URL
1760 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1761 if mobj is not None:
1762 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1763 else:
1764 player_url = None
1765
1766 dash_mpds = []
1767
1768 def add_dash_mpd(video_info):
1769 dash_mpd = video_info.get('dashmpd')
1770 if dash_mpd and dash_mpd[0] not in dash_mpds:
1771 dash_mpds.append(dash_mpd[0])
1772
1773 def add_dash_mpd_pr(pl_response):
1774 dash_mpd = url_or_none(try_get(
1775 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1776 compat_str))
1777 if dash_mpd and dash_mpd not in dash_mpds:
1778 dash_mpds.append(dash_mpd)
1779
1780 is_live = None
1781 view_count = None
1782
1783 def extract_view_count(v_info):
1784 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1785
1786 def extract_player_response(player_response, video_id):
1787 pl_response = str_or_none(player_response)
1788 if not pl_response:
1789 return
1790 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1791 if isinstance(pl_response, dict):
1792 add_dash_mpd_pr(pl_response)
1793 return pl_response
1794
1795 def extract_embedded_config(embed_webpage, video_id):
1796 embedded_config = self._search_regex(
1797 r'setConfig\(({.*})\);',
1798 embed_webpage, 'ytInitialData', default=None)
1799 if embedded_config:
1800 return embedded_config
1801
1802 player_response = {}
1803
1804 # Get video info
1805 video_info = {}
1806 embed_webpage = None
1807 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1808 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1809 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1810 age_gate = True
1811 # We simulate the access to the video from www.youtube.com/v/{video_id}
1812 # this can be viewed without login into Youtube
1813 url = proto + '://www.youtube.com/embed/%s' % video_id
1814 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1815 ext = extract_embedded_config(embed_webpage, video_id)
1816 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1817 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1818 if not playable_in_embed:
1819 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1820 playable_in_embed = ''
1821 else:
1822 playable_in_embed = playable_in_embed.group('playableinEmbed')
1823 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1824 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1825 if playable_in_embed == 'false':
1826 '''
1827 # TODO apply this patch when Support for Python 2.6(!) and above drops
1828 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1829 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1830 '''
1831 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1832 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1833 age_gate = False
1834 # Try looking directly into the video webpage
1835 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1836 if ytplayer_config:
1837 args = ytplayer_config.get("args")
1838 if args is not None:
1839 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1840 # Convert to the same format returned by compat_parse_qs
1841 video_info = dict((k, [v]) for k, v in args.items())
1842 add_dash_mpd(video_info)
1843 # Rental video is not rented but preview is available (e.g.
1844 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1845 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1846 if not video_info and args.get('ypc_vid'):
1847 return self.url_result(
1848 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1849 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1850 is_live = True
1851 if not player_response:
1852 player_response = extract_player_response(args.get('player_response'), video_id)
1853 elif not player_response:
1854 player_response = ytplayer_config
1855 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1856 add_dash_mpd_pr(player_response)
1857 else:
1858 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1859 else:
1860 data = compat_urllib_parse_urlencode({
1861 'video_id': video_id,
1862 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1863 'sts': self._search_regex(
1864 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1865 })
1866 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1867 try:
1868 video_info_webpage = self._download_webpage(
1869 video_info_url, video_id,
1870 note='Refetching age-gated info webpage',
1871 errnote='unable to download video info webpage')
1872 except ExtractorError:
1873 video_info_webpage = None
1874 if video_info_webpage:
1875 video_info = compat_parse_qs(video_info_webpage)
1876 pl_response = video_info.get('player_response', [None])[0]
1877 player_response = extract_player_response(pl_response, video_id)
1878 add_dash_mpd(video_info)
1879 view_count = extract_view_count(video_info)
1880 else:
1881 age_gate = False
1882 # Try looking directly into the video webpage
1883 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1884 args = ytplayer_config.get("args")
1885 if args is not None:
1886 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1887 # Convert to the same format returned by compat_parse_qs
1888 video_info = dict((k, [v]) for k, v in args.items())
1889 add_dash_mpd(video_info)
1890 # Rental video is not rented but preview is available (e.g.
1891 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1892 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1893 if not video_info and args.get('ypc_vid'):
1894 return self.url_result(
1895 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1896 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1897 is_live = True
1898 if not player_response:
1899 player_response = extract_player_response(args.get('player_response'), video_id)
1900 elif not player_response:
1901 player_response = ytplayer_config
1902 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1903 add_dash_mpd_pr(player_response)
1904
1905 def extract_unavailable_message():
1906 messages = []
1907 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1908 msg = self._html_search_regex(
1909 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1910 video_webpage, 'unavailable %s' % kind, default=None)
1911 if msg:
1912 messages.append(msg)
1913 if messages:
1914 return '\n'.join(messages)
1915
1916 if not video_info and not player_response:
1917 unavailable_message = extract_unavailable_message()
1918 if not unavailable_message:
1919 unavailable_message = 'Unable to extract video data'
1920 raise ExtractorError(
1921 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1922
1923 if not isinstance(video_info, dict):
1924 video_info = {}
1925
1926 video_details = try_get(
1927 player_response, lambda x: x['videoDetails'], dict) or {}
1928
1929 microformat = try_get(
1930 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1931
1932 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1933 if not video_title:
1934 self._downloader.report_warning('Unable to extract video title')
1935 video_title = '_'
1936
1937 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1938 if video_description:
1939
1940 def replace_url(m):
1941 redir_url = compat_urlparse.urljoin(url, m.group(1))
1942 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1943 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1944 qs = compat_parse_qs(parsed_redir_url.query)
1945 q = qs.get('q')
1946 if q and q[0]:
1947 return q[0]
1948 return redir_url
1949
1950 description_original = video_description = re.sub(r'''(?x)
1951 <a\s+
1952 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1953 (?:title|href)="([^"]+)"\s+
1954 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1955 class="[^"]*"[^>]*>
1956 [^<]+\.{3}\s*
1957 </a>
1958 ''', replace_url, video_description)
1959 video_description = clean_html(video_description)
1960 else:
1961 video_description = video_details.get('shortDescription')
1962 if video_description is None:
1963 video_description = self._html_search_meta('description', video_webpage)
1964
1965 if not smuggled_data.get('force_singlefeed', False):
1966 if not self._downloader.params.get('noplaylist'):
1967 multifeed_metadata_list = try_get(
1968 player_response,
1969 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1970 compat_str) or try_get(
1971 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1972 if multifeed_metadata_list:
1973 entries = []
1974 feed_ids = []
1975 for feed in multifeed_metadata_list.split(','):
1976 # Unquote should take place before split on comma (,) since textual
1977 # fields may contain comma as well (see
1978 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1979 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1980
1981 def feed_entry(name):
1982 return try_get(feed_data, lambda x: x[name][0], compat_str)
1983
1984 feed_id = feed_entry('id')
1985 if not feed_id:
1986 continue
1987 feed_title = feed_entry('title')
1988 title = video_title
1989 if feed_title:
1990 title += ' (%s)' % feed_title
1991 entries.append({
1992 '_type': 'url_transparent',
1993 'ie_key': 'Youtube',
1994 'url': smuggle_url(
1995 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1996 {'force_singlefeed': True}),
1997 'title': title,
1998 })
1999 feed_ids.append(feed_id)
2000 self.to_screen(
2001 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2002 % (', '.join(feed_ids), video_id))
2003 return self.playlist_result(entries, video_id, video_title, video_description)
2004 else:
2005 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2006
2007 if view_count is None:
2008 view_count = extract_view_count(video_info)
2009 if view_count is None and video_details:
2010 view_count = int_or_none(video_details.get('viewCount'))
2011 if view_count is None and microformat:
2012 view_count = int_or_none(microformat.get('viewCount'))
2013
2014 if is_live is None:
2015 is_live = bool_or_none(video_details.get('isLive'))
2016
2017 has_live_chat_replay = False
2018 if not is_live:
2019 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2020 try:
2021 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2022 has_live_chat_replay = True
2023 except (KeyError, IndexError, TypeError):
2024 pass
2025
2026 # Check for "rental" videos
2027 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2028 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2029
2030 def _extract_filesize(media_url):
2031 return int_or_none(self._search_regex(
2032 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2033
2034 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2035 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2036
2037 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2038 self.report_rtmp_download()
2039 formats = [{
2040 'format_id': '_rtmp',
2041 'protocol': 'rtmp',
2042 'url': video_info['conn'][0],
2043 'player_url': player_url,
2044 }]
2045 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2046 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2047 if 'rtmpe%3Dyes' in encoded_url_map:
2048 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2049 formats = []
2050 formats_spec = {}
2051 fmt_list = video_info.get('fmt_list', [''])[0]
2052 if fmt_list:
2053 for fmt in fmt_list.split(','):
2054 spec = fmt.split('/')
2055 if len(spec) > 1:
2056 width_height = spec[1].split('x')
2057 if len(width_height) == 2:
2058 formats_spec[spec[0]] = {
2059 'resolution': spec[1],
2060 'width': int_or_none(width_height[0]),
2061 'height': int_or_none(width_height[1]),
2062 }
2063 for fmt in streaming_formats:
2064 itag = str_or_none(fmt.get('itag'))
2065 if not itag:
2066 continue
2067 quality = fmt.get('quality')
2068 quality_label = fmt.get('qualityLabel') or quality
2069 formats_spec[itag] = {
2070 'asr': int_or_none(fmt.get('audioSampleRate')),
2071 'filesize': int_or_none(fmt.get('contentLength')),
2072 'format_note': quality_label,
2073 'fps': int_or_none(fmt.get('fps')),
2074 'height': int_or_none(fmt.get('height')),
2075 # bitrate for itag 43 is always 2147483647
2076 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2077 'width': int_or_none(fmt.get('width')),
2078 }
2079
2080 for fmt in streaming_formats:
2081 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2082 continue
2083 url = url_or_none(fmt.get('url'))
2084
2085 if not url:
2086 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2087 if not cipher:
2088 continue
2089 url_data = compat_parse_qs(cipher)
2090 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2091 if not url:
2092 continue
2093 else:
2094 cipher = None
2095 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2096
2097 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2098 # Unsupported FORMAT_STREAM_TYPE_OTF
2099 if stream_type == 3:
2100 continue
2101
2102 format_id = fmt.get('itag') or url_data['itag'][0]
2103 if not format_id:
2104 continue
2105 format_id = compat_str(format_id)
2106
2107 if cipher:
2108 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2109 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
2110 jsplayer_url_json = self._search_regex(
2111 ASSETS_RE,
2112 embed_webpage if age_gate else video_webpage,
2113 'JS player URL (1)', default=None)
2114 if not jsplayer_url_json and not age_gate:
2115 # We need the embed website after all
2116 if embed_webpage is None:
2117 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2118 embed_webpage = self._download_webpage(
2119 embed_url, video_id, 'Downloading embed webpage')
2120 jsplayer_url_json = self._search_regex(
2121 ASSETS_RE, embed_webpage, 'JS player URL')
2122
2123 player_url = json.loads(jsplayer_url_json)
2124 if player_url is None:
2125 player_url_json = self._search_regex(
2126 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2127 video_webpage, 'age gate player URL')
2128 player_url = json.loads(player_url_json)
2129
2130 if 'sig' in url_data:
2131 url += '&signature=' + url_data['sig'][0]
2132 elif 's' in url_data:
2133 encrypted_sig = url_data['s'][0]
2134
2135 if self._downloader.params.get('verbose'):
2136 if player_url is None:
2137 player_desc = 'unknown'
2138 else:
2139 player_type, player_version = self._extract_player_info(player_url)
2140 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2141 parts_sizes = self._signature_cache_id(encrypted_sig)
2142 self.to_screen('{%s} signature length %s, %s' %
2143 (format_id, parts_sizes, player_desc))
2144
2145 signature = self._decrypt_signature(
2146 encrypted_sig, video_id, player_url, age_gate)
2147 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2148 url += '&%s=%s' % (sp, signature)
2149 if 'ratebypass' not in url:
2150 url += '&ratebypass=yes'
2151
2152 dct = {
2153 'format_id': format_id,
2154 'url': url,
2155 'player_url': player_url,
2156 }
2157 if format_id in self._formats:
2158 dct.update(self._formats[format_id])
2159 if format_id in formats_spec:
2160 dct.update(formats_spec[format_id])
2161
2162 # Some itags are not included in DASH manifest thus corresponding formats will
2163 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2164 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2165 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2166 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2167
2168 if width is None:
2169 width = int_or_none(fmt.get('width'))
2170 if height is None:
2171 height = int_or_none(fmt.get('height'))
2172
2173 filesize = int_or_none(url_data.get(
2174 'clen', [None])[0]) or _extract_filesize(url)
2175
2176 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2177 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2178
2179 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2180 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2181 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2182
2183 more_fields = {
2184 'filesize': filesize,
2185 'tbr': tbr,
2186 'width': width,
2187 'height': height,
2188 'fps': fps,
2189 'format_note': quality_label or quality,
2190 }
2191 for key, value in more_fields.items():
2192 if value:
2193 dct[key] = value
2194 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2195 if type_:
2196 type_split = type_.split(';')
2197 kind_ext = type_split[0].split('/')
2198 if len(kind_ext) == 2:
2199 kind, _ = kind_ext
2200 dct['ext'] = mimetype2ext(type_split[0])
2201 if kind in ('audio', 'video'):
2202 codecs = None
2203 for mobj in re.finditer(
2204 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2205 if mobj.group('key') == 'codecs':
2206 codecs = mobj.group('val')
2207 break
2208 if codecs:
2209 dct.update(parse_codecs(codecs))
2210 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2211 dct['downloader_options'] = {
2212 # Youtube throttles chunks >~10M
2213 'http_chunk_size': 10485760,
2214 }
2215 formats.append(dct)
2216 else:
2217 manifest_url = (
2218 url_or_none(try_get(
2219 player_response,
2220 lambda x: x['streamingData']['hlsManifestUrl'],
2221 compat_str))
2222 or url_or_none(try_get(
2223 video_info, lambda x: x['hlsvp'][0], compat_str)))
2224 if manifest_url:
2225 formats = []
2226 m3u8_formats = self._extract_m3u8_formats(
2227 manifest_url, video_id, 'mp4', fatal=False)
2228 for a_format in m3u8_formats:
2229 itag = self._search_regex(
2230 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2231 if itag:
2232 a_format['format_id'] = itag
2233 if itag in self._formats:
2234 dct = self._formats[itag].copy()
2235 dct.update(a_format)
2236 a_format = dct
2237 a_format['player_url'] = player_url
2238 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2239 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2240 if self._downloader.params.get('youtube_include_hls_manifest', True):
2241 formats.append(a_format)
2242 else:
2243 error_message = extract_unavailable_message()
2244 if not error_message:
2245 error_message = clean_html(try_get(
2246 player_response, lambda x: x['playabilityStatus']['reason'],
2247 compat_str))
2248 if not error_message:
2249 error_message = clean_html(
2250 try_get(video_info, lambda x: x['reason'][0], compat_str))
2251 if error_message:
2252 raise ExtractorError(error_message, expected=True)
2253 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2254
2255 # uploader
2256 video_uploader = try_get(
2257 video_info, lambda x: x['author'][0],
2258 compat_str) or str_or_none(video_details.get('author'))
2259 if video_uploader:
2260 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2261 else:
2262 self._downloader.report_warning('unable to extract uploader name')
2263
2264 # uploader_id
2265 video_uploader_id = None
2266 video_uploader_url = None
2267 mobj = re.search(
2268 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2269 video_webpage)
2270 if mobj is not None:
2271 video_uploader_id = mobj.group('uploader_id')
2272 video_uploader_url = mobj.group('uploader_url')
2273 else:
2274 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2275 if owner_profile_url:
2276 video_uploader_id = self._search_regex(
2277 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2278 default=None)
2279 video_uploader_url = owner_profile_url
2280
2281 channel_id = (
2282 str_or_none(video_details.get('channelId'))
2283 or self._html_search_meta(
2284 'channelId', video_webpage, 'channel id', default=None)
2285 or self._search_regex(
2286 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2287 video_webpage, 'channel id', default=None, group='id'))
2288 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2289
2290 thumbnails = []
2291 thumbnails_list = try_get(
2292 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2293 for t in thumbnails_list:
2294 if not isinstance(t, dict):
2295 continue
2296 thumbnail_url = url_or_none(t.get('url'))
2297 if not thumbnail_url:
2298 continue
2299 thumbnails.append({
2300 'url': thumbnail_url,
2301 'width': int_or_none(t.get('width')),
2302 'height': int_or_none(t.get('height')),
2303 })
2304
2305 if not thumbnails:
2306 video_thumbnail = None
2307 # We try first to get a high quality image:
2308 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2309 video_webpage, re.DOTALL)
2310 if m_thumb is not None:
2311 video_thumbnail = m_thumb.group(1)
2312 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2313 if thumbnail_url:
2314 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2315 if video_thumbnail:
2316 thumbnails.append({'url': video_thumbnail})
2317
2318 # upload date
2319 upload_date = self._html_search_meta(
2320 'datePublished', video_webpage, 'upload date', default=None)
2321 if not upload_date:
2322 upload_date = self._search_regex(
2323 [r'(?s)id="eow-date.*?>(.*?)</span>',
2324 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2325 video_webpage, 'upload date', default=None)
2326 if not upload_date:
2327 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2328 upload_date = unified_strdate(upload_date)
2329
2330 video_license = self._html_search_regex(
2331 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2332 video_webpage, 'license', default=None)
2333
2334 m_music = re.search(
2335 r'''(?x)
2336 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2337 <ul[^>]*>\s*
2338 <li>(?P<title>.+?)
2339 by (?P<creator>.+?)
2340 (?:
2341 \(.+?\)|
2342 <a[^>]*
2343 (?:
2344 \bhref=["\']/red[^>]*>| # drop possible
2345 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2346 )
2347 .*?
2348 )?</li
2349 ''',
2350 video_webpage)
2351 if m_music:
2352 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2353 video_creator = clean_html(m_music.group('creator'))
2354 else:
2355 video_alt_title = video_creator = None
2356
2357 def extract_meta(field):
2358 return self._html_search_regex(
2359 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2360 video_webpage, field, default=None)
2361
2362 track = extract_meta('Song')
2363 artist = extract_meta('Artist')
2364 album = extract_meta('Album')
2365
2366 # Youtube Music Auto-generated description
2367 release_date = release_year = None
2368 if video_description:
2369 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2370 if mobj:
2371 if not track:
2372 track = mobj.group('track').strip()
2373 if not artist:
2374 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2375 if not album:
2376 album = mobj.group('album'.strip())
2377 release_year = mobj.group('release_year')
2378 release_date = mobj.group('release_date')
2379 if release_date:
2380 release_date = release_date.replace('-', '')
2381 if not release_year:
2382 release_year = int(release_date[:4])
2383 if release_year:
2384 release_year = int(release_year)
2385
2386 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2387 if yt_initial:
2388 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2389 if len(music_metadata):
2390 album = music_metadata[0].get('album')
2391 artist = music_metadata[0].get('artist')
2392 track = music_metadata[0].get('track')
2393
2394 m_episode = re.search(
2395 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2396 video_webpage)
2397 if m_episode:
2398 series = unescapeHTML(m_episode.group('series'))
2399 season_number = int(m_episode.group('season'))
2400 episode_number = int(m_episode.group('episode'))
2401 else:
2402 series = season_number = episode_number = None
2403
2404 m_cat_container = self._search_regex(
2405 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2406 video_webpage, 'categories', default=None)
2407 category = None
2408 if m_cat_container:
2409 category = self._html_search_regex(
2410 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2411 default=None)
2412 if not category:
2413 category = try_get(
2414 microformat, lambda x: x['category'], compat_str)
2415 video_categories = None if category is None else [category]
2416
2417 video_tags = [
2418 unescapeHTML(m.group('content'))
2419 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2420 if not video_tags:
2421 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2422
2423 def _extract_count(count_name):
2424 return str_to_int(self._search_regex(
2425 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2426 % re.escape(count_name),
2427 video_webpage, count_name, default=None))
2428
2429 like_count = _extract_count('like')
2430 dislike_count = _extract_count('dislike')
2431
2432 if view_count is None:
2433 view_count = str_to_int(self._search_regex(
2434 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2435 'view count', default=None))
2436
2437 average_rating = (
2438 float_or_none(video_details.get('averageRating'))
2439 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2440
2441 # subtitles
2442 video_subtitles = self.extract_subtitles(
2443 video_id, video_webpage, has_live_chat_replay)
2444 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2445
2446 video_duration = try_get(
2447 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2448 if not video_duration:
2449 video_duration = int_or_none(video_details.get('lengthSeconds'))
2450 if not video_duration:
2451 video_duration = parse_duration(self._html_search_meta(
2452 'duration', video_webpage, 'video duration'))
2453
2454 # Get Subscriber Count of channel
2455 subscriber_count = parse_count(self._search_regex(
2456 r'"text":"([\d\.]+\w?) subscribers"',
2457 video_webpage,
2458 'subscriber count',
2459 default=None
2460 ))
2461
2462 # annotations
2463 video_annotations = None
2464 if self._downloader.params.get('writeannotations', False):
2465 xsrf_token = self._search_regex(
2466 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2467 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2468 invideo_url = try_get(
2469 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2470 if xsrf_token and invideo_url:
2471 xsrf_field_name = self._search_regex(
2472 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2473 video_webpage, 'xsrf field name',
2474 group='xsrf_field_name', default='session_token')
2475 video_annotations = self._download_webpage(
2476 self._proto_relative_url(invideo_url),
2477 video_id, note='Downloading annotations',
2478 errnote='Unable to download video annotations', fatal=False,
2479 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2480
2481 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2482
2483 # Look for the DASH manifest
2484 if self._downloader.params.get('youtube_include_dash_manifest', True):
2485 dash_mpd_fatal = True
2486 for mpd_url in dash_mpds:
2487 dash_formats = {}
2488 try:
2489 def decrypt_sig(mobj):
2490 s = mobj.group(1)
2491 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2492 return '/signature/%s' % dec_s
2493
2494 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2495
2496 for df in self._extract_mpd_formats(
2497 mpd_url, video_id, fatal=dash_mpd_fatal,
2498 formats_dict=self._formats):
2499 if not df.get('filesize'):
2500 df['filesize'] = _extract_filesize(df['url'])
2501 # Do not overwrite DASH format found in some previous DASH manifest
2502 if df['format_id'] not in dash_formats:
2503 dash_formats[df['format_id']] = df
2504 # Additional DASH manifests may end up in HTTP Error 403 therefore
2505 # allow them to fail without bug report message if we already have
2506 # some DASH manifest succeeded. This is temporary workaround to reduce
2507 # burst of bug reports until we figure out the reason and whether it
2508 # can be fixed at all.
2509 dash_mpd_fatal = False
2510 except (ExtractorError, KeyError) as e:
2511 self.report_warning(
2512 'Skipping DASH manifest: %r' % e, video_id)
2513 if dash_formats:
2514 # Remove the formats we found through non-DASH, they
2515 # contain less info and it can be wrong, because we use
2516 # fixed values (for example the resolution). See
2517 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2518 # example.
2519 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2520 formats.extend(dash_formats.values())
2521
2522 # Check for malformed aspect ratio
2523 stretched_m = re.search(
2524 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2525 video_webpage)
2526 if stretched_m:
2527 w = float(stretched_m.group('w'))
2528 h = float(stretched_m.group('h'))
2529 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2530 # We will only process correct ratios.
2531 if w > 0 and h > 0:
2532 ratio = w / h
2533 for f in formats:
2534 if f.get('vcodec') != 'none':
2535 f['stretched_ratio'] = ratio
2536
2537 if not formats:
2538 if 'reason' in video_info:
2539 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2540 regions_allowed = self._html_search_meta(
2541 'regionsAllowed', video_webpage, default=None)
2542 countries = regions_allowed.split(',') if regions_allowed else None
2543 self.raise_geo_restricted(
2544 msg=video_info['reason'][0], countries=countries)
2545 reason = video_info['reason'][0]
2546 if 'Invalid parameters' in reason:
2547 unavailable_message = extract_unavailable_message()
2548 if unavailable_message:
2549 reason = unavailable_message
2550 raise ExtractorError(
2551 'YouTube said: %s' % reason,
2552 expected=True, video_id=video_id)
2553 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2554 raise ExtractorError('This video is DRM protected.', expected=True)
2555
2556 self._sort_formats(formats)
2557
2558 self.mark_watched(video_id, video_info, player_response)
2559
2560 return {
2561 'id': video_id,
2562 'uploader': video_uploader,
2563 'uploader_id': video_uploader_id,
2564 'uploader_url': video_uploader_url,
2565 'channel_id': channel_id,
2566 'channel_url': channel_url,
2567 'upload_date': upload_date,
2568 'license': video_license,
2569 'creator': video_creator or artist,
2570 'title': video_title,
2571 'alt_title': video_alt_title or track,
2572 'thumbnails': thumbnails,
2573 'description': video_description,
2574 'categories': video_categories,
2575 'tags': video_tags,
2576 'subtitles': video_subtitles,
2577 'automatic_captions': automatic_captions,
2578 'duration': video_duration,
2579 'age_limit': 18 if age_gate else 0,
2580 'annotations': video_annotations,
2581 'chapters': chapters,
2582 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2583 'view_count': view_count,
2584 'like_count': like_count,
2585 'dislike_count': dislike_count,
2586 'average_rating': average_rating,
2587 'formats': formats,
2588 'is_live': is_live,
2589 'start_time': start_time,
2590 'end_time': end_time,
2591 'series': series,
2592 'season_number': season_number,
2593 'episode_number': episode_number,
2594 'track': track,
2595 'artist': artist,
2596 'album': album,
2597 'release_date': release_date,
2598 'release_year': release_year,
2599 'subscriber_count': subscriber_count,
2600 }
2601
2602
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extract YouTube playlists, including mixes and playlist-carrying watch URLs.

    Handles classic playlist URLs (playlist/view_play_list/course/...), bare
    playlist IDs, embed URLs with a list= parameter, and auto-generated mixes
    (RD/UL/PU prefixes), which require a custom pagination scheme.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Log in (if credentials were provided) before any extraction happens.
        self._login()

    def extract_videos_from_page(self, page):
        """Collect (video_id, title) pairs from a playlist webpage.

        Tries the modern data-video-id markup first, then falls back to
        progressively more relaxed regexes; duplicate IDs are merged by the
        *_impl helper (defined on the base class), which also backfills
        missing titles.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            # data-title may be absent; title stays None in that case
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix by following watch pages.

        Mixes have no playlist page of their own, so we repeatedly load the
        watch page of the last seen video and scrape the sidebar until no new
        IDs appear.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Title lives in a class-named element; the exact class varies, so try
        # several candidates on the last fetched page.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download a regular playlist page and build the playlist result.

        Returns (has_videos, playlist_result); has_videos is False when the
        URL serves no playlist entries at all (callers may then fall back to
        plain video extraction).
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Detect a video-specific URL and honour --no-playlist.

        Returns (video_id, result): result is a url_result when only the
        single video should be downloaded, otherwise None; video_id is None
        when the URL carries no video at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        # group 1: playlist id matched inside a full URL; group 2: bare id
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2964
2965
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract all videos of a YouTube channel.

    Prefers redirecting to the channel's auto-generated uploads playlist
    (UU...) when the channel id can be discovered, since page-by-page channel
    listing is capped by YouTube; otherwise falls back to scraping the
    channel's /videos pages.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Overridden by YoutubeUserIE, which needs the original URL as well.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            # Primary source: the channelId meta tag
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Secondary source: app deep-link meta tags carrying the id
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # UCxxxx channel id maps to the UUxxxx uploads playlist
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            # Probe for at least one entry; an empty channel page may instead
            # carry an alert (e.g. terminated channel) worth surfacing.
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3065
3066
class YoutubeUserIE(YoutubeChannelIE):
    """Extract a user's videos from /user/, /c/, bare vanity URLs or "ytuser:".

    Reuses YoutubeChannelIE's extraction; only the template URL construction
    differs (user/custom path segment instead of channel id).
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would otherwise shadow
        # the more specific extractors.
        # (The generator previously carried a redundant iter() wrapper; any()
        # consumes the generator directly and short-circuits the same way.)
        if any(klass.suitable(url)
               for name, klass in globals().items()
               if name.startswith('Youtube') and name.endswith('IE') and klass is not cls):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Keep the original path kind (user/c); default to 'user' for ytuser:
        # style URLs, where no path segment is present.
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
3124
3125
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a channel's /live URL to its currently running live video."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand off to YoutubeIE when a live video id is found, else to the channel URL."""
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            # Page fetch failed (best effort): fall back to the plain channel URL.
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        # Only trust the meta video id on a video-typed page and when it has
        # the canonical 11-character shape.
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3176
3177
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """List all playlists of a user/channel (/playlists pages).

    All extraction logic lives in the base class; this class only supplies
    the URL pattern and tests.
    """
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3210
3211
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base for search extractors: the regex used to scrape results pages."""
    # Matches a /watch link with an 11-character video id, optionally
    # followed by its title attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3214
3215
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Search extractor backed by the youtubei (InnerTube) JSON API."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra base64-encoded filter params for the API (set by subclasses,
    # e.g. sort-by-date in YoutubeSearchDateIE).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent results for query, paging via continuation tokens."""
        # Request payload for the youtubei/v1/search endpoint; the client
        # context mimics the web client.
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            # NOTE: the key in the URL is the web client's public API key,
            # not a user credential.
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page and continuation pages nest the section list
            # differently; try both shapes.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Skip non-video renderers (channels, playlists, ads, ...)
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Strip whitespace/thousand separators, then read leading digits
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # Second section entry carries the continuation token, if any.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3304
3305
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but with results sorted newest-first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # API filter param selecting upload-date ordering ('CAI=', URL-encoded)
    _SEARCH_PARAMS = 'CAI%3D'
3311
3312
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract result videos from a YouTube search-results URL via its ytInitialData blob."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Recursively collect every dict in the parsed JSON tree that has a videoId key."""
        videos = []

        def walk(node):
            # None and strings are leaves with nothing to collect.
            if node is None or isinstance(node, str):
                return

            if type(node) is list:
                for element in node:
                    walk(element)

            if type(node) is dict:
                if "videoId" in node:
                    # A single-video renderer: keep it and do not descend.
                    videos.append(node)
                    return

                for child in node.values():
                    walk(child)

        walk(extracted)

        return videos

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Fill ids_in_page/titles_in_page from the page's embedded ytInitialData."""
        raw_data = self._search_regex(self._SEARCH_DATA, page, 'ytInitialData')
        search_response = self._parse_json(raw_data, None)

        for renderer in self._find_videos_in_json(search_response):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            if video_id is None or video_title is None:
                # not a videoRenderer, or the title layout has changed
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                # Duplicate id: keep the first occurrence but backfill a
                # missing title if this renderer carries one.
                position = ids_in_page.index(video_id)
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs found on the results page."""
        collected_ids = []
        collected_titles = []
        self.extract_videos_from_page_impl(page, collected_ids, collected_titles)
        return zip(collected_ids, collected_titles)

    def _real_extract(self, url):
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
3386
3387
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for multi-season YouTube shows; delegates to the playlists
    extractor on the show's /playlists page."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3405
3406
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Scan the parsed JSON tree and return a tuple of
        (list of dicts containing 'videoId', continuation data or None).

        The continuation is taken from any 'nextContinuationData' object
        encountered; if several exist, a later one overwrites earlier ones.
        """
        videos = []
        c = {}

        def _real_find(obj):
            # compat_str also matches py2 unicode strings (plain `str` would
            # not); string leaves cannot contain videos, stop descending.
            if obj is None or isinstance(obj, compat_str):
                return

            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)

            if isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return

                if 'nextContinuationData' in obj:
                    c['continuation'] = obj['nextContinuationData']
                    return

                # only the nested values can hold further renderers
                for value in obj.values():
                    _real_find(value)

        _real_find(extracted)

        return videos, try_get(c, lambda x: x['continuation'])

    def _entries(self, page):
        """Yield url_results for the feed, following continuations until a
        page yields no unseen videos or no continuation token is available."""
        # ids already yielded — set membership is O(1), unlike the previous
        # linear rescan of all collected videos for every new candidate
        seen_ids = set()

        yt_conf = self._parse_json(self._search_regex(
            self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)

        search_response = self._parse_json(
            self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_info = []
            for v in video_info:
                v_id = try_get(v, lambda x: x['videoId'])
                if not v_id or v_id in seen_ids:
                    continue
                seen_ids.add(v_id)
                new_info.append(v)

            # nothing new on this page — the feed has been exhausted
            if not new_info:
                break

            for video in new_info:
                yield self.url_result(
                    try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(),
                    video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])
                    or try_get(video, lambda x: x['title']['simpleText']))

            # without both a continuation token and the ytcfg client headers
            # we cannot request the next page
            if not continuation or not yt_conf:
                break

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query={
                    "ctoken": try_get(continuation, lambda x: x["continuation"]),
                    "continuation": try_get(continuation, lambda x: x["continuation"]),
                    "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
                },
                headers={
                    "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
                    "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
                    "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
                    "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
                    "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
                    "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
                    "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
                })

    def _real_extract(self, url):
        """Download the feed landing page and return it as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3513
3514
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's watch-later list ('WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # a watch URL with list=WL may resolve to the single video instead
        _, video = self._check_download_just_video(url, 'WL')
        if video:
            return video
        return self._extract_playlist('WL')[1]
3534
3535
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # find the playlist id embedded in the favourites page, then defer
        # the actual extraction to the playlist extractor
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
3546
3547
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
3553
3554
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
3560
3561
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
3567
3568
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch-all for watch URLs whose video id got eaten by the shell —
    matching always fails with an explanatory error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # there is nothing to extract — the URL is missing its video id,
        # almost always because & acted as a shell meta character
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
3616
3617
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the expected 11
    characters and fail with a clear message instead of a confusing 404."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)