]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/youtube.py
Update to release 2020.11.24 except youtube and skyit extractors
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 bool_or_none,
30 clean_html,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 parse_codecs,
38 parse_count,
39 parse_duration,
40 remove_quotes,
41 remove_start,
42 smuggle_url,
43 str_or_none,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 update_url_query,
50 uppercase_escape,
51 url_or_none,
52 urlencode_postdata,
53 urljoin,
54 )
55
56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's "GlifWebSignIn" login flow
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path components that can never be a channel/user name
    _RESERVED_NAMES = (
        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Force the English interface by setting the PREF cookie."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result entries for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Keep the documented contract: False means login failed
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow; Google prepends junk before
            # the JSON array, hence the transform_source stripping everything
            # up to the first '['.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token required by the challenge step
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Keep the documented contract: False means login failed
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesize the conditional so the prefix is kept for
            # unknown error codes as well (plain '%' binds tighter than
            # the conditional expression).
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # 'TL' token identifies this TFA session in _TFA_URL
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as the password warning above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved programmatically
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Fetching this URL finalizes the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Copy the query dict so the caller's mapping is never mutated
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON blob embedded in a watch page.

        Returns the parsed dict, or None if the blob is absent or unparsable.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        # _downloader is None during e.g. --list-extractors
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Minimal innertube context sent with every _call_api request
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'

    def _call_api(self, ep, query, video_id):
        """POST to the innertube endpoint *ep* and return the decoded JSON.

        *query* is merged over _DEFAULT_API_DATA; the 'key' URL parameter is
        the public web-client innertube API key.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Like _get_yt_initial_data but fatal: raises if the blob is missing."""
        return self._parse_json(
            self._search_regex(
                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
330
331
332 class YoutubeIE(YoutubeBaseInfoExtractor):
333 IE_DESC = 'YouTube.com'
334 _VALID_URL = r"""(?x)^
335 (
336 (?:https?://|//) # http(s):// or protocol-independent URL
337 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
338 (?:www\.)?deturl\.com/www\.youtube\.com/|
339 (?:www\.)?pwnyoutube\.com/|
340 (?:www\.)?hooktube\.com/|
341 (?:www\.)?yourepeat\.com/|
342 tube\.majestyc\.net/|
343 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
344 (?:(?:www|dev)\.)?invidio\.us/|
345 (?:(?:www|no)\.)?invidiou\.sh/|
346 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
347 (?:www\.)?invidious\.kabi\.tk/|
348 (?:www\.)?invidious\.13ad\.de/|
349 (?:www\.)?invidious\.mastodon\.host/|
350 (?:www\.)?invidious\.nixnet\.xyz/|
351 (?:www\.)?invidious\.drycat\.fr/|
352 (?:www\.)?tube\.poal\.co/|
353 (?:www\.)?vid\.wxzm\.sx/|
354 (?:www\.)?yewtu\.be/|
355 (?:www\.)?yt\.elukerio\.org/|
356 (?:www\.)?yt\.lelux\.fi/|
357 (?:www\.)?invidious\.ggc-project\.de/|
358 (?:www\.)?yt\.maisputain\.ovh/|
359 (?:www\.)?invidious\.13ad\.de/|
360 (?:www\.)?invidious\.toot\.koeln/|
361 (?:www\.)?invidious\.fdn\.fr/|
362 (?:www\.)?watch\.nettohikari\.com/|
363 (?:www\.)?kgg2m7yk5aybusll\.onion/|
364 (?:www\.)?qklhadlycap4cnod\.onion/|
365 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
366 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
367 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
368 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
369 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
370 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
371 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
372 (?:.*?\#/)? # handle anchor (#/) redirect urls
373 (?: # the various things that can precede the ID:
374 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
375 |(?: # or the v= param in all its forms
376 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
377 (?:\?|\#!?) # the params delimiter ? or # or #!
378 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
379 v=
380 )
381 ))
382 |(?:
383 youtu\.be| # just youtu.be/xxxx
384 vid\.plus| # or vid.plus/xxxx
385 zwearz\.com/watch| # or zwearz.com/watch/xxxx
386 )/
387 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
388 )
389 )? # all until now is optional -> you can pass the naked ID
390 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
391 (?!.*?\blist=
392 (?:
393 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
394 WL # WL are handled by the watch later IE
395 )
396 )
397 (?(1).+)? # if we found the ID, everything can follow
398 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
399 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
400 _PLAYER_INFO_RE = (
401 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
402 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
403 )
404 _formats = {
405 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
406 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
407 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
408 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
409 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
410 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
412 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
413 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
414 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
415 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
416 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
417 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
418 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
419 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
420 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
421 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
422 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
423
424
425 # 3D videos
426 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
427 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
428 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
429 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
430 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
431 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
432 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
433
434 # Apple HTTP Live Streaming
435 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
436 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
437 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
438 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
439 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
440 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
441 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
442 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
443
444 # DASH mp4 video
445 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
449 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
450 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
451 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
453 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
454 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
455 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
456 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
457
458 # Dash mp4 audio
459 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
460 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
461 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
462 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
463 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
464 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
465 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
466
467 # Dash webm
468 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
473 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
474 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
475 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
482 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
484 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
485 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
487 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
490
491 # Dash webm audio
492 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
493 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
494
495 # Dash webm audio with opus inside
496 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
497 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
498 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
499
500 # RTMP (unnamed)
501 '_rtmp': {'protocol': 'rtmp'},
502
503 # av01 video only formats sometimes served with "unknown" codecs
504 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
507 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
508 }
509 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
510
511 _GEO_BYPASS = False
512
513 IE_NAME = 'youtube'
514 _TESTS = [
515 {
516 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
517 'info_dict': {
518 'id': 'BaW_jenozKc',
519 'ext': 'mp4',
520 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
521 'uploader': 'Philipp Hagemeister',
522 'uploader_id': 'phihag',
523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
524 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
525 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
526 'upload_date': '20121002',
527 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
528 'categories': ['Science & Technology'],
529 'tags': ['youtube-dl'],
530 'duration': 10,
531 'view_count': int,
532 'like_count': int,
533 'dislike_count': int,
534 'start_time': 1,
535 'end_time': 9,
536 }
537 },
538 {
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
548 'uploader_id': 'setindia',
549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
550 'age_limit': 18,
551 }
552 },
553 {
554 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
555 'note': 'Use the first video ID in the URL',
556 'info_dict': {
557 'id': 'BaW_jenozKc',
558 'ext': 'mp4',
559 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
560 'uploader': 'Philipp Hagemeister',
561 'uploader_id': 'phihag',
562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
563 'upload_date': '20121002',
564 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
565 'categories': ['Science & Technology'],
566 'tags': ['youtube-dl'],
567 'duration': 10,
568 'view_count': int,
569 'like_count': int,
570 'dislike_count': int,
571 },
572 'params': {
573 'skip_download': True,
574 },
575 },
576 {
577 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
578 'note': '256k DASH audio (format 141) via DASH manifest',
579 'info_dict': {
580 'id': 'a9LDPn-MO4I',
581 'ext': 'm4a',
582 'upload_date': '20121002',
583 'uploader_id': '8KVIDEO',
584 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
585 'description': '',
586 'uploader': '8KVIDEO',
587 'title': 'UHDTV TEST 8K VIDEO.mp4'
588 },
589 'params': {
590 'youtube_include_dash_manifest': True,
591 'format': '141',
592 },
593 'skip': 'format 141 not served anymore',
594 },
595 # DASH manifest with encrypted signature
596 {
597 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
598 'info_dict': {
599 'id': 'IB3lcPjvWLA',
600 'ext': 'm4a',
601 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
602 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
603 'duration': 244,
604 'uploader': 'AfrojackVEVO',
605 'uploader_id': 'AfrojackVEVO',
606 'upload_date': '20131011',
607 },
608 'params': {
609 'youtube_include_dash_manifest': True,
610 'format': '141/bestaudio[ext=m4a]',
611 },
612 },
613 # Controversy video
614 {
615 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
616 'info_dict': {
617 'id': 'T4XJQO3qol8',
618 'ext': 'mp4',
619 'duration': 219,
620 'upload_date': '20100909',
621 'uploader': 'Amazing Atheist',
622 'uploader_id': 'TheAmazingAtheist',
623 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
624 'title': 'Burning Everyone\'s Koran',
625 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
626 }
627 },
628 # Normal age-gate video (embed allowed)
629 {
630 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
631 'info_dict': {
632 'id': 'HtVdAasjOgU',
633 'ext': 'mp4',
634 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
635 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
636 'duration': 142,
637 'uploader': 'The Witcher',
638 'uploader_id': 'WitcherGame',
639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
640 'upload_date': '20140605',
641 'age_limit': 18,
642 },
643 },
644 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
645 # YouTube Red ad is not captured for creator
646 {
647 'url': '__2ABJjxzNo',
648 'info_dict': {
649 'id': '__2ABJjxzNo',
650 'ext': 'mp4',
651 'duration': 266,
652 'upload_date': '20100430',
653 'uploader_id': 'deadmau5',
654 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
655 'creator': 'Dada Life, deadmau5',
656 'description': 'md5:12c56784b8032162bb936a5f76d55360',
657 'uploader': 'deadmau5',
658 'title': 'Deadmau5 - Some Chords (HD)',
659 'alt_title': 'This Machine Kills Some Chords',
660 },
661 'expected_warnings': [
662 'DASH manifest missing',
663 ]
664 },
665 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
666 {
667 'url': 'lqQg6PlCWgI',
668 'info_dict': {
669 'id': 'lqQg6PlCWgI',
670 'ext': 'mp4',
671 'duration': 6085,
672 'upload_date': '20150827',
673 'uploader_id': 'olympic',
674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
675 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
676 'uploader': 'Olympic',
677 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
678 },
679 'params': {
680 'skip_download': 'requires avconv',
681 }
682 },
683 # Non-square pixels
684 {
685 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
686 'info_dict': {
687 'id': '_b-2C3KPAM0',
688 'ext': 'mp4',
689 'stretched_ratio': 16 / 9.,
690 'duration': 85,
691 'upload_date': '20110310',
692 'uploader_id': 'AllenMeow',
693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
694 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
695 'uploader': '孫ᄋᄅ',
696 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
697 },
698 },
699 # url_encoded_fmt_stream_map is empty string
700 {
701 'url': 'qEJwOuvDf7I',
702 'info_dict': {
703 'id': 'qEJwOuvDf7I',
704 'ext': 'webm',
705 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
706 'description': '',
707 'upload_date': '20150404',
708 'uploader_id': 'spbelect',
709 'uploader': 'Наблюдатели Петербурга',
710 },
711 'params': {
712 'skip_download': 'requires avconv',
713 },
714 'skip': 'This live event has ended.',
715 },
716 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
717 {
718 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
719 'info_dict': {
720 'id': 'FIl7x6_3R5Y',
721 'ext': 'webm',
722 'title': 'md5:7b81415841e02ecd4313668cde88737a',
723 'description': 'md5:116377fd2963b81ec4ce64b542173306',
724 'duration': 220,
725 'upload_date': '20150625',
726 'uploader_id': 'dorappi2000',
727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
728 'uploader': 'dorappi2000',
729 'formats': 'mincount:31',
730 },
731 'skip': 'not actual anymore',
732 },
733 # DASH manifest with segment_list
734 {
735 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
736 'md5': '8ce563a1d667b599d21064e982ab9e31',
737 'info_dict': {
738 'id': 'CsmdDsKjzN8',
739 'ext': 'mp4',
740 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
741 'uploader': 'Airtek',
742 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
743 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
744 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
745 },
746 'params': {
747 'youtube_include_dash_manifest': True,
748 'format': '135', # bestvideo
749 },
750 'skip': 'This live event has ended.',
751 },
752 {
753 # Multifeed videos (multiple cameras), URL is for Main Camera
754 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'title': 'teamPGP: Rocket League Noob Stream',
758 'description': 'md5:dc7872fb300e143831327f1bae3af010',
759 },
760 'playlist': [{
761 'info_dict': {
762 'id': 'jqWvoWXjCVs',
763 'ext': 'mp4',
764 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 'duration': 7335,
767 'upload_date': '20150721',
768 'uploader': 'Beer Games Beer',
769 'uploader_id': 'beergamesbeer',
770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
771 'license': 'Standard YouTube License',
772 },
773 }, {
774 'info_dict': {
775 'id': '6h8e8xoXJzg',
776 'ext': 'mp4',
777 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
778 'description': 'md5:dc7872fb300e143831327f1bae3af010',
779 'duration': 7337,
780 'upload_date': '20150721',
781 'uploader': 'Beer Games Beer',
782 'uploader_id': 'beergamesbeer',
783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
784 'license': 'Standard YouTube License',
785 },
786 }, {
787 'info_dict': {
788 'id': 'PUOgX5z9xZw',
789 'ext': 'mp4',
790 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
792 'duration': 7337,
793 'upload_date': '20150721',
794 'uploader': 'Beer Games Beer',
795 'uploader_id': 'beergamesbeer',
796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
797 'license': 'Standard YouTube License',
798 },
799 }, {
800 'info_dict': {
801 'id': 'teuwxikvS5k',
802 'ext': 'mp4',
803 'title': 'teamPGP: Rocket League Noob Stream (zim)',
804 'description': 'md5:dc7872fb300e143831327f1bae3af010',
805 'duration': 7334,
806 'upload_date': '20150721',
807 'uploader': 'Beer Games Beer',
808 'uploader_id': 'beergamesbeer',
809 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
810 'license': 'Standard YouTube License',
811 },
812 }],
813 'params': {
814 'skip_download': True,
815 },
816 'skip': 'This video is not available.',
817 },
818 {
819 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
820 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
821 'info_dict': {
822 'id': 'gVfLd0zydlo',
823 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
824 },
825 'playlist_count': 2,
826 'skip': 'Not multifeed anymore',
827 },
828 {
829 'url': 'https://vid.plus/FlRa-iH7PGw',
830 'only_matching': True,
831 },
832 {
833 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
834 'only_matching': True,
835 },
836 {
837 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
838 # Also tests cut-off URL expansion in video description (see
839 # https://github.com/ytdl-org/youtube-dl/issues/1892,
840 # https://github.com/ytdl-org/youtube-dl/issues/8164)
841 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
842 'info_dict': {
843 'id': 'lsguqyKfVQg',
844 'ext': 'mp4',
845 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
846 'alt_title': 'Dark Walk - Position Music',
847 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
848 'duration': 133,
849 'upload_date': '20151119',
850 'uploader_id': 'IronSoulElf',
851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
852 'uploader': 'IronSoulElf',
853 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'track': 'Dark Walk - Position Music',
855 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
856 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
857 },
858 'params': {
859 'skip_download': True,
860 },
861 },
862 {
863 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
864 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
865 'only_matching': True,
866 },
867 {
868 # Video with yt:stretch=17:0
869 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
870 'info_dict': {
871 'id': 'Q39EVAstoRM',
872 'ext': 'mp4',
873 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
874 'description': 'md5:ee18a25c350637c8faff806845bddee9',
875 'upload_date': '20151107',
876 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
877 'uploader': 'CH GAMER DROID',
878 },
879 'params': {
880 'skip_download': True,
881 },
882 'skip': 'This video does not exist.',
883 },
884 {
885 # Video licensed under Creative Commons
886 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
887 'info_dict': {
888 'id': 'M4gD1WSo5mA',
889 'ext': 'mp4',
890 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
891 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
892 'duration': 721,
893 'upload_date': '20150127',
894 'uploader_id': 'BerkmanCenter',
895 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
896 'uploader': 'The Berkman Klein Center for Internet & Society',
897 'license': 'Creative Commons Attribution license (reuse allowed)',
898 },
899 'params': {
900 'skip_download': True,
901 },
902 },
903 {
904 # Channel-like uploader_url
905 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
906 'info_dict': {
907 'id': 'eQcmzGIKrzg',
908 'ext': 'mp4',
909 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
910 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
911 'duration': 4060,
912 'upload_date': '20151119',
913 'uploader': 'Bernie Sanders',
914 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
916 'license': 'Creative Commons Attribution license (reuse allowed)',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
922 {
923 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
924 'only_matching': True,
925 },
926 {
927 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
928 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
929 'only_matching': True,
930 },
931 {
932 # Rental video preview
933 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
934 'info_dict': {
935 'id': 'uGpuVWrhIzE',
936 'ext': 'mp4',
937 'title': 'Piku - Trailer',
938 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
939 'upload_date': '20150811',
940 'uploader': 'FlixMatrix',
941 'uploader_id': 'FlixMatrixKaravan',
942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
943 'license': 'Standard YouTube License',
944 },
945 'params': {
946 'skip_download': True,
947 },
948 'skip': 'This video is not available.',
949 },
950 {
951 # YouTube Red video with episode data
952 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
953 'info_dict': {
954 'id': 'iqKdEhx-dD4',
955 'ext': 'mp4',
956 'title': 'Isolation - Mind Field (Ep 1)',
957 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
958 'duration': 2085,
959 'upload_date': '20170118',
960 'uploader': 'Vsauce',
961 'uploader_id': 'Vsauce',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
963 'series': 'Mind Field',
964 'season_number': 1,
965 'episode_number': 1,
966 },
967 'params': {
968 'skip_download': True,
969 },
970 'expected_warnings': [
971 'Skipping DASH manifest',
972 ],
973 },
974 {
975 # The following content has been identified by the YouTube community
976 # as inappropriate or offensive to some audiences.
977 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
978 'info_dict': {
979 'id': '6SJNVb0GnPI',
980 'ext': 'mp4',
981 'title': 'Race Differences in Intelligence',
982 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
983 'duration': 965,
984 'upload_date': '20140124',
985 'uploader': 'New Century Foundation',
986 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
988 },
989 'params': {
990 'skip_download': True,
991 },
992 },
993 {
994 # itag 212
995 'url': '1t24XAntNCY',
996 'only_matching': True,
997 },
998 {
999 # geo restricted to JP
1000 'url': 'sJL6WA-aGkQ',
1001 'only_matching': True,
1002 },
1003 {
1004 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1005 'only_matching': True,
1006 },
1007 {
1008 # DRM protected
1009 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1010 'only_matching': True,
1011 },
1012 {
1013 # Video with unsupported adaptive stream type formats
1014 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1015 'info_dict': {
1016 'id': 'Z4Vy8R84T1U',
1017 'ext': 'mp4',
1018 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1019 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1020 'duration': 433,
1021 'upload_date': '20130923',
1022 'uploader': 'Amelia Putri Harwita',
1023 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1025 'formats': 'maxcount:10',
1026 },
1027 'params': {
1028 'skip_download': True,
1029 'youtube_include_dash_manifest': False,
1030 },
1031 'skip': 'not actual anymore',
1032 },
1033 {
1034 # Youtube Music Auto-generated description
1035 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1036 'info_dict': {
1037 'id': 'MgNrAu2pzNs',
1038 'ext': 'mp4',
1039 'title': 'Voyeur Girl',
1040 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1041 'upload_date': '20190312',
1042 'uploader': 'Stephen - Topic',
1043 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1044 'artist': 'Stephen',
1045 'track': 'Voyeur Girl',
1046 'album': 'it\'s too much love to know my dear',
1047 'release_date': '20190313',
1048 'release_year': 2019,
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 },
1054 {
1055 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1056 'only_matching': True,
1057 },
1058 {
1059 # invalid -> valid video id redirection
1060 'url': 'DJztXj2GPfl',
1061 'info_dict': {
1062 'id': 'DJztXj2GPfk',
1063 'ext': 'mp4',
1064 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1065 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1066 'upload_date': '20090125',
1067 'uploader': 'Prochorowka',
1068 'uploader_id': 'Prochorowka',
1069 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1070 'artist': 'Panjabi MC',
1071 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1072 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1073 },
1074 'params': {
1075 'skip_download': True,
1076 },
1077 },
1078 {
1079 # empty description results in an empty string
1080 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1081 'info_dict': {
1082 'id': 'x41yOUIvK2k',
1083 'ext': 'mp4',
1084 'title': 'IMG 3456',
1085 'description': '',
1086 'upload_date': '20170613',
1087 'uploader_id': 'ElevageOrVert',
1088 'uploader': 'ElevageOrVert',
1089 },
1090 'params': {
1091 'skip_download': True,
1092 },
1093 },
1094 {
1095 # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
1096 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1097 'info_dict': {
1098 'id': 'CHqg6qOn4no',
1099 'ext': 'mp4',
1100 'title': 'Part 77 Sort a list of simple types in c#',
1101 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1102 'upload_date': '20130831',
1103 'uploader_id': 'kudvenkat',
1104 'uploader': 'kudvenkat',
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 ]
1111
    def __init__(self, *args, **kwargs):
        """Initialize the extractor; _player_cache memoizes deciphered
        signature functions per (player_url, signature layout) — see
        _decrypt_signature."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}
1115
1116 def report_video_info_webpage_download(self, video_id):
1117 """Report attempt to download video info webpage."""
1118 self.to_screen('%s: Downloading video info webpage' % video_id)
1119
1120 def report_information_extraction(self, video_id):
1121 """Report attempt to extract video information."""
1122 self.to_screen('%s: Extracting video information' % video_id)
1123
1124 def report_unavailable_format(self, video_id, format):
1125 """Report extracted video URL."""
1126 self.to_screen('%s: Format %s not available' % (video_id, format))
1127
1128 def report_rtmp_download(self):
1129 """Indicate the download will use the RTMP protocol."""
1130 self.to_screen('RTMP download detected')
1131
1132 def _signature_cache_id(self, example_sig):
1133 """ Return a string representation of a signature """
1134 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1135
1136 @classmethod
1137 def _extract_player_info(cls, player_url):
1138 for player_re in cls._PLAYER_INFO_RE:
1139 id_m = re.search(player_re, player_url)
1140 if id_m:
1141 break
1142 else:
1143 raise ExtractorError('Cannot identify player %r' % player_url)
1144 return id_m.group('ext'), id_m.group('id')
1145
1146 def _extract_signature_function(self, video_id, player_url, example_sig):
1147 player_type, player_id = self._extract_player_info(player_url)
1148
1149 # Read from filesystem cache
1150 func_id = '%s_%s_%s' % (
1151 player_type, player_id, self._signature_cache_id(example_sig))
1152 assert os.path.basename(func_id) == func_id
1153
1154 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1155 if cache_spec is not None:
1156 return lambda s: ''.join(s[i] for i in cache_spec)
1157
1158 download_note = (
1159 'Downloading player %s' % player_url
1160 if self._downloader.params.get('verbose') else
1161 'Downloading %s player %s' % (player_type, player_id)
1162 )
1163 if player_type == 'js':
1164 code = self._download_webpage(
1165 player_url, video_id,
1166 note=download_note,
1167 errnote='Download of %s failed' % player_url)
1168 res = self._parse_sig_js(code)
1169 elif player_type == 'swf':
1170 urlh = self._request_webpage(
1171 player_url, video_id,
1172 note=download_note,
1173 errnote='Download of %s failed' % player_url)
1174 code = urlh.read()
1175 res = self._parse_sig_swf(code)
1176 else:
1177 assert False, 'Invalid player type %r' % player_type
1178
1179 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1180 cache_res = res(test_string)
1181 cache_spec = [ord(c) for c in cache_res]
1182
1183 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1184 return res
1185
    def _print_sig_code(self, func, example_sig):
        """Print Python source reproducing *func* (a signature permutation)
        as a compact chain of slice/index expressions, for embedding a
        deciphering routine directly in code."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting defaults
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, coalescing +1/-1 runs into slices
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it or flush it
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new run that can become a slice
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or run.
            # NOTE(review): relies on `i` leaking out of the loop — assumes
            # idxs has at least two entries (signatures are never that short)
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with a string of distinct characters to observe the permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1224
    def _parse_sig_js(self, jscode):
        """Locate the signature-deciphering function in the JS player code
        and return a callable mapping a scrambled signature string to its
        deciphered form, backed by the bundled JS interpreter.

        Patterns are ordered most-specific first; the obsolete ones are kept
        for older player revisions.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The extracted JS function takes its argument list as a Python list
        return lambda s: initial_function([s])
1245
1246 def _parse_sig_swf(self, file_contents):
1247 swfi = SWFInterpreter(file_contents)
1248 TARGET_CLASSNAME = 'SignatureDecipher'
1249 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1250 initial_function = swfi.extract_function(searched_class, 'decipher')
1251 return lambda s: initial_function([s])
1252
1253 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1254 """Turn the encrypted s field into a working signature"""
1255
1256 if player_url is None:
1257 raise ExtractorError('Cannot decrypt signature without player_url')
1258
1259 if player_url.startswith('//'):
1260 player_url = 'https:' + player_url
1261 elif not re.match(r'https?://', player_url):
1262 player_url = compat_urlparse.urljoin(
1263 'https://www.youtube.com', player_url)
1264 try:
1265 player_id = (player_url, self._signature_cache_id(s))
1266 if player_id not in self._player_cache:
1267 func = self._extract_signature_function(
1268 video_id, player_url, s
1269 )
1270 self._player_cache[player_id] = func
1271 func = self._player_cache[player_id]
1272 if self._downloader.params.get('youtube_print_sig_code'):
1273 self._print_sig_code(func, s)
1274 return func(s)
1275 except Exception as e:
1276 tb = traceback.format_exc()
1277 raise ExtractorError(
1278 'Signature extraction failed: ' + tb, cause=e)
1279
1280 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1281 try:
1282 subs_doc = self._download_xml(
1283 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1284 video_id, note=False)
1285 except ExtractorError as err:
1286 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1287 return {}
1288
1289 sub_lang_list = {}
1290 for track in subs_doc.findall('track'):
1291 lang = track.attrib['lang_code']
1292 if lang in sub_lang_list:
1293 continue
1294 sub_formats = []
1295 for ext in self._SUBTITLE_FORMATS:
1296 params = compat_urllib_parse_urlencode({
1297 'lang': lang,
1298 'v': video_id,
1299 'fmt': ext,
1300 'name': track.attrib['name'].encode('utf-8'),
1301 })
1302 sub_formats.append({
1303 'url': 'https://www.youtube.com/api/timedtext?' + params,
1304 'ext': ext,
1305 })
1306 sub_lang_list[lang] = sub_formats
1307 if has_live_chat_replay:
1308 sub_lang_list['live_chat'] = [
1309 {
1310 'video_id': video_id,
1311 'ext': 'json',
1312 'protocol': 'youtube_live_chat_replay',
1313 },
1314 ]
1315 if not sub_lang_list:
1316 self._downloader.report_warning('video doesn\'t have subtitles')
1317 return {}
1318 return sub_lang_list
1319
1320 def _get_ytplayer_config(self, video_id, webpage):
1321 patterns = (
1322 # User data may contain arbitrary character sequences that may affect
1323 # JSON extraction with regex, e.g. when '};' is contained the second
1324 # regex won't capture the whole JSON. Yet working around by trying more
1325 # concrete regex first keeping in mind proper quoted string handling
1326 # to be implemented in future that will replace this workaround (see
1327 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1328 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1329 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1330 r';ytplayer\.config\s*=\s*({.+?});',
1331 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
1332 )
1333 config = self._search_regex(
1334 patterns, webpage, 'ytplayer.config', default=None)
1335 if config:
1336 return self._parse_json(
1337 uppercase_escape(config), video_id, fatal=False)
1338
1339 def _get_music_metadata_from_yt_initial(self, yt_initial):
1340 music_metadata = []
1341 key_map = {
1342 'Album': 'album',
1343 'Artist': 'artist',
1344 'Song': 'track'
1345 }
1346 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1347 if type(contents) is list:
1348 for content in contents:
1349 music_track = {}
1350 if type(content) is not dict:
1351 continue
1352 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1353 if type(videoSecondaryInfoRenderer) is not dict:
1354 continue
1355 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1356 if type(rows) is not list:
1357 continue
1358 for row in rows:
1359 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1360 if type(metadataRowRenderer) is not dict:
1361 continue
1362 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1363 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1364 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1365 if type(key) is not str or type(value) is not str:
1366 continue
1367 if key in key_map:
1368 if key_map[key] in music_track:
1369 # we've started on a new track
1370 music_metadata.append(music_track)
1371 music_track = {}
1372 music_track[key_map[key]] = value
1373 if len(music_track.keys()):
1374 music_metadata.append(music_track)
1375 return music_metadata
1376
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code to a list of caption format
        dicts, or {} (after a warning) when no automatic captions exist.
        Three historical caption APIs are tried in order: ttsurl,
        player_response captionTracks, and legacy caption_tracks args.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Oldest API: a ttsurl endpoint listing available tracks
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                # One set of formats per translation target language
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the query of
                # a single base caption URL (tlang/fmt vary per entry)
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # No longer used as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An ExtractorError can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1478
1479 def _mark_watched(self, video_id, video_info, player_response):
1480 playback_url = url_or_none(try_get(
1481 player_response,
1482 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1483 video_info, lambda x: x['videostats_playback_base_url'][0]))
1484 if not playback_url:
1485 return
1486 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1487 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1488
1489 # cpn generation algorithm is reverse engineered from base.js.
1490 # In fact it works even with dummy cpn.
1491 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1492 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1493
1494 qs.update({
1495 'ver': ['2'],
1496 'cpn': [cpn],
1497 })
1498 playback_url = compat_urlparse.urlunparse(
1499 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1500
1501 self._download_webpage(
1502 playback_url, video_id, 'Marking watched',
1503 'Unable to mark watched', fatal=False)
1504
1505 @staticmethod
1506 def _extract_urls(webpage):
1507 # Embedded YouTube player
1508 entries = [
1509 unescapeHTML(mobj.group('url'))
1510 for mobj in re.finditer(r'''(?x)
1511 (?:
1512 <iframe[^>]+?src=|
1513 data-video-url=|
1514 <embed[^>]+?src=|
1515 embedSWF\(?:\s*|
1516 <object[^>]+data=|
1517 new\s+SWFObject\(
1518 )
1519 (["\'])
1520 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1521 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1522 \1''', webpage)]
1523
1524 # lazyYT YouTube embed
1525 entries.extend(list(map(
1526 unescapeHTML,
1527 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1528
1529 # Wordpress "YouTube Video Importer" plugin
1530 matches = re.findall(r'''(?x)<div[^>]+
1531 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1532 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1533 entries.extend(m[-1] for m in matches)
1534
1535 return entries
1536
1537 @staticmethod
1538 def _extract_url(webpage):
1539 urls = YoutubeIE._extract_urls(webpage)
1540 return urls[0] if urls else None
1541
1542 @classmethod
1543 def extract_id(cls, url):
1544 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1545 if mobj is None:
1546 raise ExtractorError('Invalid URL: %s' % url)
1547 video_id = mobj.group(2)
1548 return video_id
1549
1550 def _extract_chapters_from_json(self, webpage, video_id, duration):
1551 if not webpage:
1552 return
1553 data = self._extract_yt_initial_data(video_id, webpage)
1554 if not data or not isinstance(data, dict):
1555 return
1556 chapters_list = try_get(
1557 data,
1558 lambda x: x['playerOverlays']
1559 ['playerOverlayRenderer']
1560 ['decoratedPlayerBarRenderer']
1561 ['decoratedPlayerBarRenderer']
1562 ['playerBar']
1563 ['chapteredPlayerBarRenderer']
1564 ['chapters'],
1565 list)
1566 if not chapters_list:
1567 return
1568
1569 def chapter_time(chapter):
1570 return float_or_none(
1571 try_get(
1572 chapter,
1573 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1574 int),
1575 scale=1000)
1576 chapters = []
1577 for next_num, chapter in enumerate(chapters_list, start=1):
1578 start_time = chapter_time(chapter)
1579 if start_time is None:
1580 continue
1581 end_time = (chapter_time(chapters_list[next_num])
1582 if next_num < len(chapters_list) else duration)
1583 if end_time is None:
1584 continue
1585 title = try_get(
1586 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1587 compat_str)
1588 chapters.append({
1589 'start_time': start_time,
1590 'end_time': end_time,
1591 'title': title,
1592 })
1593 return chapters
1594
1595 @staticmethod
1596 def _extract_chapters_from_description(description, duration):
1597 if not description:
1598 return None
1599 chapter_lines = re.findall(
1600 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1601 description)
1602 if not chapter_lines:
1603 return None
1604 chapters = []
1605 for next_num, (chapter_line, time_point) in enumerate(
1606 chapter_lines, start=1):
1607 start_time = parse_duration(time_point)
1608 if start_time is None:
1609 continue
1610 if start_time > duration:
1611 break
1612 end_time = (duration if next_num == len(chapter_lines)
1613 else parse_duration(chapter_lines[next_num][1]))
1614 if end_time is None:
1615 continue
1616 if end_time > duration:
1617 end_time = duration
1618 if start_time > end_time:
1619 break
1620 chapter_title = re.sub(
1621 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1622 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1623 chapters.append({
1624 'start_time': start_time,
1625 'end_time': end_time,
1626 'title': chapter_title,
1627 })
1628 return chapters
1629
1630 def _extract_chapters(self, webpage, description, video_id, duration):
1631 return (self._extract_chapters_from_json(webpage, video_id, duration)
1632 or self._extract_chapters_from_description(description, duration))
1633
1634 def _real_extract(self, url):
1635 url, smuggled_data = unsmuggle_url(url, {})
1636
1637 proto = (
1638 'http' if self._downloader.params.get('prefer_insecure', False)
1639 else 'https')
1640
1641 start_time = None
1642 end_time = None
1643 parsed_url = compat_urllib_parse_urlparse(url)
1644 for component in [parsed_url.fragment, parsed_url.query]:
1645 query = compat_parse_qs(component)
1646 if start_time is None and 't' in query:
1647 start_time = parse_duration(query['t'][0])
1648 if start_time is None and 'start' in query:
1649 start_time = parse_duration(query['start'][0])
1650 if end_time is None and 'end' in query:
1651 end_time = parse_duration(query['end'][0])
1652
1653 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1654 mobj = re.search(self._NEXT_URL_RE, url)
1655 if mobj:
1656 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1657 video_id = self.extract_id(url)
1658
1659 # Get video webpage
1660 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1661 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1662
1663 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1664 video_id = qs.get('v', [None])[0] or video_id
1665
1666 # Attempt to extract SWF player URL
1667 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1668 if mobj is not None:
1669 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1670 else:
1671 player_url = None
1672
1673 dash_mpds = []
1674
1675 def add_dash_mpd(video_info):
1676 dash_mpd = video_info.get('dashmpd')
1677 if dash_mpd and dash_mpd[0] not in dash_mpds:
1678 dash_mpds.append(dash_mpd[0])
1679
1680 def add_dash_mpd_pr(pl_response):
1681 dash_mpd = url_or_none(try_get(
1682 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1683 compat_str))
1684 if dash_mpd and dash_mpd not in dash_mpds:
1685 dash_mpds.append(dash_mpd)
1686
1687 is_live = None
1688 view_count = None
1689
1690 def extract_view_count(v_info):
1691 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1692
1693 def extract_player_response(player_response, video_id):
1694 pl_response = str_or_none(player_response)
1695 if not pl_response:
1696 return
1697 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1698 if isinstance(pl_response, dict):
1699 add_dash_mpd_pr(pl_response)
1700 return pl_response
1701
1702 def extract_embedded_config(embed_webpage, video_id):
1703 embedded_config = self._search_regex(
1704 r'setConfig\(({.*})\);',
1705 embed_webpage, 'ytInitialData', default=None)
1706 if embedded_config:
1707 return embedded_config
1708
1709 player_response = {}
1710
1711 # Get video info
1712 video_info = {}
1713 embed_webpage = None
1714 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1715 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1716 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1717 age_gate = True
1718 # We simulate the access to the video from www.youtube.com/v/{video_id}
1719 # this can be viewed without login into Youtube
1720 url = proto + '://www.youtube.com/embed/%s' % video_id
1721 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1722 ext = extract_embedded_config(embed_webpage, video_id)
1723 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1724 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1725 if not playable_in_embed:
1726 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1727 playable_in_embed = ''
1728 else:
1729 playable_in_embed = playable_in_embed.group('playableinEmbed')
1730 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1731 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1732 if playable_in_embed == 'false':
1733 '''
1734 # TODO apply this patch when Support for Python 2.6(!) and above drops
1735 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1736 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1737 '''
1738 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1739 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1740 age_gate = False
1741 # Try looking directly into the video webpage
1742 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1743 if ytplayer_config:
1744 args = ytplayer_config.get("args")
1745 if args is not None:
1746 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1747 # Convert to the same format returned by compat_parse_qs
1748 video_info = dict((k, [v]) for k, v in args.items())
1749 add_dash_mpd(video_info)
1750 # Rental video is not rented but preview is available (e.g.
1751 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1752 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1753 if not video_info and args.get('ypc_vid'):
1754 return self.url_result(
1755 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1756 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1757 is_live = True
1758 if not player_response:
1759 player_response = extract_player_response(args.get('player_response'), video_id)
1760 elif not player_response:
1761 player_response = ytplayer_config
1762 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1763 add_dash_mpd_pr(player_response)
1764 else:
1765 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1766 else:
1767 data = compat_urllib_parse_urlencode({
1768 'video_id': video_id,
1769 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1770 'sts': self._search_regex(
1771 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1772 })
1773 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1774 try:
1775 video_info_webpage = self._download_webpage(
1776 video_info_url, video_id,
1777 note='Refetching age-gated info webpage',
1778 errnote='unable to download video info webpage')
1779 except ExtractorError:
1780 video_info_webpage = None
1781 if video_info_webpage:
1782 video_info = compat_parse_qs(video_info_webpage)
1783 pl_response = video_info.get('player_response', [None])[0]
1784 player_response = extract_player_response(pl_response, video_id)
1785 add_dash_mpd(video_info)
1786 view_count = extract_view_count(video_info)
1787 else:
1788 age_gate = False
1789 # Try looking directly into the video webpage
1790 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1791 if ytplayer_config:
1792 args = ytplayer_config.get('args', {})
1793 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1794 # Convert to the same format returned by compat_parse_qs
1795 video_info = dict((k, [v]) for k, v in args.items())
1796 add_dash_mpd(video_info)
1797 # Rental video is not rented but preview is available (e.g.
1798 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1799 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1800 if not video_info and args.get('ypc_vid'):
1801 return self.url_result(
1802 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1803 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1804 is_live = True
1805 if not player_response:
1806 player_response = extract_player_response(args.get('player_response'), video_id)
1807 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1808 add_dash_mpd_pr(player_response)
1809
1810 if not video_info and not player_response:
1811 player_response = extract_player_response(
1812 self._search_regex(
1813 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1814 'initial player response', default='{}'),
1815 video_id)
1816
1817 def extract_unavailable_message():
1818 messages = []
1819 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1820 msg = self._html_search_regex(
1821 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1822 video_webpage, 'unavailable %s' % kind, default=None)
1823 if msg:
1824 messages.append(msg)
1825 if messages:
1826 return '\n'.join(messages)
1827
1828 if not video_info and not player_response:
1829 unavailable_message = extract_unavailable_message()
1830 if not unavailable_message:
1831 unavailable_message = 'Unable to extract video data'
1832 raise ExtractorError(
1833 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1834
1835 if not isinstance(video_info, dict):
1836 video_info = {}
1837
1838 video_details = try_get(
1839 player_response, lambda x: x['videoDetails'], dict) or {}
1840
1841 microformat = try_get(
1842 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1843
1844 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1845 if not video_title:
1846 self._downloader.report_warning('Unable to extract video title')
1847 video_title = '_'
1848
1849 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1850 if video_description:
1851
1852 def replace_url(m):
1853 redir_url = compat_urlparse.urljoin(url, m.group(1))
1854 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1855 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1856 qs = compat_parse_qs(parsed_redir_url.query)
1857 q = qs.get('q')
1858 if q and q[0]:
1859 return q[0]
1860 return redir_url
1861
1862 description_original = video_description = re.sub(r'''(?x)
1863 <a\s+
1864 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1865 (?:title|href)="([^"]+)"\s+
1866 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1867 class="[^"]*"[^>]*>
1868 [^<]+\.{3}\s*
1869 </a>
1870 ''', replace_url, video_description)
1871 video_description = clean_html(video_description)
1872 else:
1873 video_description = video_details.get('shortDescription')
1874 if video_description is None:
1875 video_description = self._html_search_meta('description', video_webpage)
1876
1877 if not smuggled_data.get('force_singlefeed', False):
1878 if not self._downloader.params.get('noplaylist'):
1879 multifeed_metadata_list = try_get(
1880 player_response,
1881 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1882 compat_str) or try_get(
1883 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1884 if multifeed_metadata_list:
1885 entries = []
1886 feed_ids = []
1887 for feed in multifeed_metadata_list.split(','):
1888 # Unquote should take place before split on comma (,) since textual
1889 # fields may contain comma as well (see
1890 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1891 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1892
1893 def feed_entry(name):
1894 return try_get(feed_data, lambda x: x[name][0], compat_str)
1895
1896 feed_id = feed_entry('id')
1897 if not feed_id:
1898 continue
1899 feed_title = feed_entry('title')
1900 title = video_title
1901 if feed_title:
1902 title += ' (%s)' % feed_title
1903 entries.append({
1904 '_type': 'url_transparent',
1905 'ie_key': 'Youtube',
1906 'url': smuggle_url(
1907 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1908 {'force_singlefeed': True}),
1909 'title': title,
1910 })
1911 feed_ids.append(feed_id)
1912 self.to_screen(
1913 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1914 % (', '.join(feed_ids), video_id))
1915 return self.playlist_result(entries, video_id, video_title, video_description)
1916 else:
1917 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1918
1919 if view_count is None:
1920 view_count = extract_view_count(video_info)
1921 if view_count is None and video_details:
1922 view_count = int_or_none(video_details.get('viewCount'))
1923 if view_count is None and microformat:
1924 view_count = int_or_none(microformat.get('viewCount'))
1925
1926 if is_live is None:
1927 is_live = bool_or_none(video_details.get('isLive'))
1928
1929 has_live_chat_replay = False
1930 if not is_live:
1931 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1932 try:
1933 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1934 has_live_chat_replay = True
1935 except (KeyError, IndexError, TypeError):
1936 pass
1937
1938 # Check for "rental" videos
1939 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1940 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1941
1942 def _extract_filesize(media_url):
1943 return int_or_none(self._search_regex(
1944 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1945
1946 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1947 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1948
1949 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1950 self.report_rtmp_download()
1951 formats = [{
1952 'format_id': '_rtmp',
1953 'protocol': 'rtmp',
1954 'url': video_info['conn'][0],
1955 'player_url': player_url,
1956 }]
1957 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1958 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1959 if 'rtmpe%3Dyes' in encoded_url_map:
1960 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1961 formats = []
1962 formats_spec = {}
1963 fmt_list = video_info.get('fmt_list', [''])[0]
1964 if fmt_list:
1965 for fmt in fmt_list.split(','):
1966 spec = fmt.split('/')
1967 if len(spec) > 1:
1968 width_height = spec[1].split('x')
1969 if len(width_height) == 2:
1970 formats_spec[spec[0]] = {
1971 'resolution': spec[1],
1972 'width': int_or_none(width_height[0]),
1973 'height': int_or_none(width_height[1]),
1974 }
1975 for fmt in streaming_formats:
1976 itag = str_or_none(fmt.get('itag'))
1977 if not itag:
1978 continue
1979 quality = fmt.get('quality')
1980 quality_label = fmt.get('qualityLabel') or quality
1981 formats_spec[itag] = {
1982 'asr': int_or_none(fmt.get('audioSampleRate')),
1983 'filesize': int_or_none(fmt.get('contentLength')),
1984 'format_note': quality_label,
1985 'fps': int_or_none(fmt.get('fps')),
1986 'height': int_or_none(fmt.get('height')),
1987 # bitrate for itag 43 is always 2147483647
1988 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1989 'width': int_or_none(fmt.get('width')),
1990 }
1991
1992 for fmt in streaming_formats:
1993 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1994 continue
1995 url = url_or_none(fmt.get('url'))
1996
1997 if not url:
1998 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1999 if not cipher:
2000 continue
2001 url_data = compat_parse_qs(cipher)
2002 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2003 if not url:
2004 continue
2005 else:
2006 cipher = None
2007 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2008
2009 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2010 # Unsupported FORMAT_STREAM_TYPE_OTF
2011 if stream_type == 3:
2012 continue
2013
2014 format_id = fmt.get('itag') or url_data['itag'][0]
2015 if not format_id:
2016 continue
2017 format_id = compat_str(format_id)
2018
2019 if cipher:
2020 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2021 ASSETS_RE = (
2022 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2023 r'"jsUrl"\s*:\s*("[^"]+")',
2024 r'"assets":.+?"js":\s*("[^"]+")')
2025 jsplayer_url_json = self._search_regex(
2026 ASSETS_RE,
2027 embed_webpage if age_gate else video_webpage,
2028 'JS player URL (1)', default=None)
2029 if not jsplayer_url_json and not age_gate:
2030 # We need the embed website after all
2031 if embed_webpage is None:
2032 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2033 embed_webpage = self._download_webpage(
2034 embed_url, video_id, 'Downloading embed webpage')
2035 jsplayer_url_json = self._search_regex(
2036 ASSETS_RE, embed_webpage, 'JS player URL')
2037
2038 player_url = json.loads(jsplayer_url_json)
2039 if player_url is None:
2040 player_url_json = self._search_regex(
2041 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2042 video_webpage, 'age gate player URL')
2043 player_url = json.loads(player_url_json)
2044
2045 if 'sig' in url_data:
2046 url += '&signature=' + url_data['sig'][0]
2047 elif 's' in url_data:
2048 encrypted_sig = url_data['s'][0]
2049
2050 if self._downloader.params.get('verbose'):
2051 if player_url is None:
2052 player_desc = 'unknown'
2053 else:
2054 player_type, player_version = self._extract_player_info(player_url)
2055 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2056 parts_sizes = self._signature_cache_id(encrypted_sig)
2057 self.to_screen('{%s} signature length %s, %s' %
2058 (format_id, parts_sizes, player_desc))
2059
2060 signature = self._decrypt_signature(
2061 encrypted_sig, video_id, player_url, age_gate)
2062 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2063 url += '&%s=%s' % (sp, signature)
2064 if 'ratebypass' not in url:
2065 url += '&ratebypass=yes'
2066
2067 dct = {
2068 'format_id': format_id,
2069 'url': url,
2070 'player_url': player_url,
2071 }
2072 if format_id in self._formats:
2073 dct.update(self._formats[format_id])
2074 if format_id in formats_spec:
2075 dct.update(formats_spec[format_id])
2076
2077 # Some itags are not included in DASH manifest thus corresponding formats will
2078 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2079 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2080 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2081 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2082
2083 if width is None:
2084 width = int_or_none(fmt.get('width'))
2085 if height is None:
2086 height = int_or_none(fmt.get('height'))
2087
2088 filesize = int_or_none(url_data.get(
2089 'clen', [None])[0]) or _extract_filesize(url)
2090
2091 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2092 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2093
2094 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2095 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2096 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2097
2098 more_fields = {
2099 'filesize': filesize,
2100 'tbr': tbr,
2101 'width': width,
2102 'height': height,
2103 'fps': fps,
2104 'format_note': quality_label or quality,
2105 }
2106 for key, value in more_fields.items():
2107 if value:
2108 dct[key] = value
2109 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2110 if type_:
2111 type_split = type_.split(';')
2112 kind_ext = type_split[0].split('/')
2113 if len(kind_ext) == 2:
2114 kind, _ = kind_ext
2115 dct['ext'] = mimetype2ext(type_split[0])
2116 if kind in ('audio', 'video'):
2117 codecs = None
2118 for mobj in re.finditer(
2119 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2120 if mobj.group('key') == 'codecs':
2121 codecs = mobj.group('val')
2122 break
2123 if codecs:
2124 dct.update(parse_codecs(codecs))
2125 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2126 dct['downloader_options'] = {
2127 # Youtube throttles chunks >~10M
2128 'http_chunk_size': 10485760,
2129 }
2130 formats.append(dct)
2131 else:
2132 manifest_url = (
2133 url_or_none(try_get(
2134 player_response,
2135 lambda x: x['streamingData']['hlsManifestUrl'],
2136 compat_str))
2137 or url_or_none(try_get(
2138 video_info, lambda x: x['hlsvp'][0], compat_str)))
2139 if manifest_url:
2140 formats = []
2141 m3u8_formats = self._extract_m3u8_formats(
2142 manifest_url, video_id, 'mp4', fatal=False)
2143 for a_format in m3u8_formats:
2144 itag = self._search_regex(
2145 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2146 if itag:
2147 a_format['format_id'] = itag
2148 if itag in self._formats:
2149 dct = self._formats[itag].copy()
2150 dct.update(a_format)
2151 a_format = dct
2152 a_format['player_url'] = player_url
2153 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2154 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2155 if self._downloader.params.get('youtube_include_hls_manifest', True):
2156 formats.append(a_format)
2157 else:
2158 error_message = extract_unavailable_message()
2159 if not error_message:
2160 reason_list = try_get(
2161 player_response,
2162 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2163 list) or []
2164 for reason in reason_list:
2165 if not isinstance(reason, dict):
2166 continue
2167 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2168 if reason_text:
2169 if not error_message:
2170 error_message = ''
2171 error_message += reason_text
2172 if error_message:
2173 error_message = clean_html(error_message)
2174 if not error_message:
2175 error_message = clean_html(try_get(
2176 player_response, lambda x: x['playabilityStatus']['reason'],
2177 compat_str))
2178 if not error_message:
2179 error_message = clean_html(
2180 try_get(video_info, lambda x: x['reason'][0], compat_str))
2181 if error_message:
2182 raise ExtractorError(error_message, expected=True)
2183 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2184
2185 # uploader
2186 video_uploader = try_get(
2187 video_info, lambda x: x['author'][0],
2188 compat_str) or str_or_none(video_details.get('author'))
2189 if video_uploader:
2190 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2191 else:
2192 self._downloader.report_warning('unable to extract uploader name')
2193
2194 # uploader_id
2195 video_uploader_id = None
2196 video_uploader_url = None
2197 mobj = re.search(
2198 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2199 video_webpage)
2200 if mobj is not None:
2201 video_uploader_id = mobj.group('uploader_id')
2202 video_uploader_url = mobj.group('uploader_url')
2203 else:
2204 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2205 if owner_profile_url:
2206 video_uploader_id = self._search_regex(
2207 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2208 default=None)
2209 video_uploader_url = owner_profile_url
2210
2211 channel_id = (
2212 str_or_none(video_details.get('channelId'))
2213 or self._html_search_meta(
2214 'channelId', video_webpage, 'channel id', default=None)
2215 or self._search_regex(
2216 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2217 video_webpage, 'channel id', default=None, group='id'))
2218 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2219
2220 thumbnails = []
2221 thumbnails_list = try_get(
2222 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2223 for t in thumbnails_list:
2224 if not isinstance(t, dict):
2225 continue
2226 thumbnail_url = url_or_none(t.get('url'))
2227 if not thumbnail_url:
2228 continue
2229 thumbnails.append({
2230 'url': thumbnail_url,
2231 'width': int_or_none(t.get('width')),
2232 'height': int_or_none(t.get('height')),
2233 })
2234
2235 if not thumbnails:
2236 video_thumbnail = None
2237 # We try first to get a high quality image:
2238 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2239 video_webpage, re.DOTALL)
2240 if m_thumb is not None:
2241 video_thumbnail = m_thumb.group(1)
2242 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2243 if thumbnail_url:
2244 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2245 if video_thumbnail:
2246 thumbnails.append({'url': video_thumbnail})
2247
2248 # upload date
2249 upload_date = self._html_search_meta(
2250 'datePublished', video_webpage, 'upload date', default=None)
2251 if not upload_date:
2252 upload_date = self._search_regex(
2253 [r'(?s)id="eow-date.*?>(.*?)</span>',
2254 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2255 video_webpage, 'upload date', default=None)
2256 if not upload_date:
2257 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2258 upload_date = unified_strdate(upload_date)
2259
2260 video_license = self._html_search_regex(
2261 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2262 video_webpage, 'license', default=None)
2263
2264 m_music = re.search(
2265 r'''(?x)
2266 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2267 <ul[^>]*>\s*
2268 <li>(?P<title>.+?)
2269 by (?P<creator>.+?)
2270 (?:
2271 \(.+?\)|
2272 <a[^>]*
2273 (?:
2274 \bhref=["\']/red[^>]*>| # drop possible
2275 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2276 )
2277 .*?
2278 )?</li
2279 ''',
2280 video_webpage)
2281 if m_music:
2282 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2283 video_creator = clean_html(m_music.group('creator'))
2284 else:
2285 video_alt_title = video_creator = None
2286
2287 def extract_meta(field):
2288 return self._html_search_regex(
2289 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2290 video_webpage, field, default=None)
2291
2292 track = extract_meta('Song')
2293 artist = extract_meta('Artist')
2294 album = extract_meta('Album')
2295
2296 # Youtube Music Auto-generated description
2297 release_date = release_year = None
2298 if video_description:
2299 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2300 if mobj:
2301 if not track:
2302 track = mobj.group('track').strip()
2303 if not artist:
2304 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2305 if not album:
2306 album = mobj.group('album'.strip())
2307 release_year = mobj.group('release_year')
2308 release_date = mobj.group('release_date')
2309 if release_date:
2310 release_date = release_date.replace('-', '')
2311 if not release_year:
2312 release_year = int(release_date[:4])
2313 if release_year:
2314 release_year = int(release_year)
2315
2316 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2317 if yt_initial:
2318 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2319 if len(music_metadata):
2320 album = music_metadata[0].get('album')
2321 artist = music_metadata[0].get('artist')
2322 track = music_metadata[0].get('track')
2323
2324 m_episode = re.search(
2325 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2326 video_webpage)
2327 if m_episode:
2328 series = unescapeHTML(m_episode.group('series'))
2329 season_number = int(m_episode.group('season'))
2330 episode_number = int(m_episode.group('episode'))
2331 else:
2332 series = season_number = episode_number = None
2333
2334 m_cat_container = self._search_regex(
2335 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2336 video_webpage, 'categories', default=None)
2337 category = None
2338 if m_cat_container:
2339 category = self._html_search_regex(
2340 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2341 default=None)
2342 if not category:
2343 category = try_get(
2344 microformat, lambda x: x['category'], compat_str)
2345 video_categories = None if category is None else [category]
2346
2347 video_tags = [
2348 unescapeHTML(m.group('content'))
2349 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2350 if not video_tags:
2351 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2352
2353 def _extract_count(count_name):
2354 return str_to_int(self._search_regex(
2355 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2356 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2357 video_webpage, count_name, default=None))
2358
2359 like_count = _extract_count('like')
2360 dislike_count = _extract_count('dislike')
2361
2362 if view_count is None:
2363 view_count = str_to_int(self._search_regex(
2364 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2365 'view count', default=None))
2366
2367 average_rating = (
2368 float_or_none(video_details.get('averageRating'))
2369 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2370
2371 # subtitles
2372 video_subtitles = self.extract_subtitles(
2373 video_id, video_webpage, has_live_chat_replay)
2374 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2375
2376 video_duration = try_get(
2377 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2378 if not video_duration:
2379 video_duration = int_or_none(video_details.get('lengthSeconds'))
2380 if not video_duration:
2381 video_duration = parse_duration(self._html_search_meta(
2382 'duration', video_webpage, 'video duration'))
2383
2384 # Get Subscriber Count of channel
2385 subscriber_count = parse_count(self._search_regex(
2386 r'"text":"([\d\.]+\w?) subscribers"',
2387 video_webpage,
2388 'subscriber count',
2389 default=None
2390 ))
2391
2392 # annotations
2393 video_annotations = None
2394 if self._downloader.params.get('writeannotations', False):
2395 xsrf_token = self._search_regex(
2396 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2397 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2398 invideo_url = try_get(
2399 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2400 if xsrf_token and invideo_url:
2401 xsrf_field_name = self._search_regex(
2402 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2403 video_webpage, 'xsrf field name',
2404 group='xsrf_field_name', default='session_token')
2405 video_annotations = self._download_webpage(
2406 self._proto_relative_url(invideo_url),
2407 video_id, note='Downloading annotations',
2408 errnote='Unable to download video annotations', fatal=False,
2409 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2410
2411 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2412
2413 # Look for the DASH manifest
2414 if self._downloader.params.get('youtube_include_dash_manifest', True):
2415 dash_mpd_fatal = True
2416 for mpd_url in dash_mpds:
2417 dash_formats = {}
2418 try:
2419 def decrypt_sig(mobj):
2420 s = mobj.group(1)
2421 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2422 return '/signature/%s' % dec_s
2423
2424 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2425
2426 for df in self._extract_mpd_formats(
2427 mpd_url, video_id, fatal=dash_mpd_fatal,
2428 formats_dict=self._formats):
2429 if not df.get('filesize'):
2430 df['filesize'] = _extract_filesize(df['url'])
2431 # Do not overwrite DASH format found in some previous DASH manifest
2432 if df['format_id'] not in dash_formats:
2433 dash_formats[df['format_id']] = df
2434 # Additional DASH manifests may end up in HTTP Error 403 therefore
2435 # allow them to fail without bug report message if we already have
2436 # some DASH manifest succeeded. This is temporary workaround to reduce
2437 # burst of bug reports until we figure out the reason and whether it
2438 # can be fixed at all.
2439 dash_mpd_fatal = False
2440 except (ExtractorError, KeyError) as e:
2441 self.report_warning(
2442 'Skipping DASH manifest: %r' % e, video_id)
2443 if dash_formats:
2444 # Remove the formats we found through non-DASH, they
2445 # contain less info and it can be wrong, because we use
2446 # fixed values (for example the resolution). See
2447 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2448 # example.
2449 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2450 formats.extend(dash_formats.values())
2451
2452 # Check for malformed aspect ratio
2453 stretched_m = re.search(
2454 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2455 video_webpage)
2456 if stretched_m:
2457 w = float(stretched_m.group('w'))
2458 h = float(stretched_m.group('h'))
2459 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2460 # We will only process correct ratios.
2461 if w > 0 and h > 0:
2462 ratio = w / h
2463 for f in formats:
2464 if f.get('vcodec') != 'none':
2465 f['stretched_ratio'] = ratio
2466
2467 if not formats:
2468 if 'reason' in video_info:
2469 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2470 regions_allowed = self._html_search_meta(
2471 'regionsAllowed', video_webpage, default=None)
2472 countries = regions_allowed.split(',') if regions_allowed else None
2473 self.raise_geo_restricted(
2474 msg=video_info['reason'][0], countries=countries)
2475 reason = video_info['reason'][0]
2476 if 'Invalid parameters' in reason:
2477 unavailable_message = extract_unavailable_message()
2478 if unavailable_message:
2479 reason = unavailable_message
2480 raise ExtractorError(
2481 'YouTube said: %s' % reason,
2482 expected=True, video_id=video_id)
2483 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2484 raise ExtractorError('This video is DRM protected.', expected=True)
2485
2486 self._sort_formats(formats)
2487
2488 self.mark_watched(video_id, video_info, player_response)
2489
2490 return {
2491 'id': video_id,
2492 'uploader': video_uploader,
2493 'uploader_id': video_uploader_id,
2494 'uploader_url': video_uploader_url,
2495 'channel_id': channel_id,
2496 'channel_url': channel_url,
2497 'upload_date': upload_date,
2498 'license': video_license,
2499 'creator': video_creator or artist,
2500 'title': video_title,
2501 'alt_title': video_alt_title or track,
2502 'thumbnails': thumbnails,
2503 'description': video_description,
2504 'categories': video_categories,
2505 'tags': video_tags,
2506 'subtitles': video_subtitles,
2507 'automatic_captions': automatic_captions,
2508 'duration': video_duration,
2509 'age_limit': 18 if age_gate else 0,
2510 'annotations': video_annotations,
2511 'chapters': chapters,
2512 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2513 'view_count': view_count,
2514 'like_count': like_count,
2515 'dislike_count': dislike_count,
2516 'average_rating': average_rating,
2517 'formats': formats,
2518 'is_live': is_live,
2519 'start_time': start_time,
2520 'end_time': end_time,
2521 'series': series,
2522 'season_number': season_number,
2523 'episode_number': episode_number,
2524 'track': track,
2525 'artist': artist,
2526 'album': album,
2527 'release_date': release_date,
2528 'release_year': release_year,
2529 'subscriber_count': subscriber_count,
2530 }
2531
2532
2533 class YoutubeTabIE(YoutubeBaseInfoExtractor):
2534 IE_DESC = 'YouTube.com tab'
2535 _VALID_URL = (r'''(?x)
2536 https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:
2537 (?!(%s)([/#?]|$))|channel/|c/|user/|
2538 (?P<not_channel>playlist|watch)/?\?.*?\blist=)
2539 (?P<id>[^/?#&]+)''') % YoutubeBaseInfoExtractor._RESERVED_NAMES
2540 IE_NAME = 'youtube:tab'
2541
2542 _TESTS = [{
2543 # playlists, multipage
2544 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2545 'playlist_mincount': 94,
2546 'info_dict': {
2547 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2548 'title': 'Игорь Клейнер - Playlists',
2549 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2550 },
2551 }, {
2552 # playlists, multipage, different order
2553 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2554 'playlist_mincount': 94,
2555 'info_dict': {
2556 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2557 'title': 'Игорь Клейнер - Playlists',
2558 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2559 },
2560 }, {
2561 # playlists, singlepage
2562 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2563 'playlist_mincount': 4,
2564 'info_dict': {
2565 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2566 'title': 'ThirstForScience - Playlists',
2567 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2568 }
2569 }, {
2570 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2571 'only_matching': True,
2572 }, {
2573 # basic, single video playlist
2574 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2575 'info_dict': {
2576 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2577 'uploader': 'Sergey M.',
2578 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2579 'title': 'youtube-dl public playlist',
2580 },
2581 'playlist_count': 1,
2582 }, {
2583 # empty playlist
2584 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2585 'info_dict': {
2586 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2587 'uploader': 'Sergey M.',
2588 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2589 'title': 'youtube-dl empty playlist',
2590 },
2591 'playlist_count': 0,
2592 }, {
2593 # Home tab
2594 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2595 'info_dict': {
2596 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2597 'title': 'lex will - Home',
2598 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2599 },
2600 'playlist_mincount': 2,
2601 }, {
2602 # Videos tab
2603 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2604 'info_dict': {
2605 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2606 'title': 'lex will - Videos',
2607 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2608 },
2609 'playlist_mincount': 975,
2610 }, {
2611 # Videos tab, sorted by popular
2612 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2613 'info_dict': {
2614 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2615 'title': 'lex will - Videos',
2616 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2617 },
2618 'playlist_mincount': 199,
2619 }, {
2620 # Playlists tab
2621 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2622 'info_dict': {
2623 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2624 'title': 'lex will - Playlists',
2625 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2626 },
2627 'playlist_mincount': 17,
2628 }, {
2629 # Community tab
2630 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2631 'info_dict': {
2632 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2633 'title': 'lex will - Community',
2634 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2635 },
2636 'playlist_mincount': 18,
2637 }, {
2638 # Channels tab
2639 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2640 'info_dict': {
2641 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2642 'title': 'lex will - Channels',
2643 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2644 },
2645 'playlist_mincount': 138,
2646 }, {
2647 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2648 'only_matching': True,
2649 }, {
2650 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2651 'only_matching': True,
2652 }, {
2653 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2654 'only_matching': True,
2655 }, {
2656 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2657 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2658 'info_dict': {
2659 'title': '29C3: Not my department',
2660 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2661 'uploader': 'Christiaan008',
2662 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2663 },
2664 'playlist_count': 96,
2665 }, {
2666 'note': 'Large playlist',
2667 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2668 'info_dict': {
2669 'title': 'Uploads from Cauchemar',
2670 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2671 'uploader': 'Cauchemar',
2672 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2673 },
2674 'playlist_mincount': 1123,
2675 }, {
2676 # even larger playlist, 8832 videos
2677 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2678 'only_matching': True,
2679 }, {
2680 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2681 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2682 'info_dict': {
2683 'title': 'Uploads from Interstellar Movie',
2684 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2685 'uploader': 'Interstellar Movie',
2686 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2687 },
2688 'playlist_mincount': 21,
2689 }, {
2690 # https://github.com/ytdl-org/youtube-dl/issues/21844
2691 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2692 'info_dict': {
2693 'title': 'Data Analysis with Dr Mike Pound',
2694 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2695 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2696 'uploader': 'Computerphile',
2697 },
2698 'playlist_mincount': 11,
2699 }, {
2700 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2701 'only_matching': True,
2702 }, {
2703 # Playlist URL that does not actually serve a playlist
2704 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2705 'info_dict': {
2706 'id': 'FqZTN594JQw',
2707 'ext': 'webm',
2708 'title': "Smiley's People 01 detective, Adventure Series, Action",
2709 'uploader': 'STREEM',
2710 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2711 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2712 'upload_date': '20150526',
2713 'license': 'Standard YouTube License',
2714 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2715 'categories': ['People & Blogs'],
2716 'tags': list,
2717 'view_count': int,
2718 'like_count': int,
2719 'dislike_count': int,
2720 },
2721 'params': {
2722 'skip_download': True,
2723 },
2724 'skip': 'This video is not available.',
2725 'add_ie': [YoutubeIE.ie_key()],
2726 }, {
2727 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2728 'only_matching': True,
2729 }, {
2730 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2731 'only_matching': True,
2732 }, {
2733 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2734 'info_dict': {
2735 'id': '9Auq9mYxFEE',
2736 'ext': 'mp4',
2737 'title': 'Watch Sky News live',
2738 'uploader': 'Sky News',
2739 'uploader_id': 'skynews',
2740 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2741 'upload_date': '20191102',
2742 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2743 'categories': ['News & Politics'],
2744 'tags': list,
2745 'like_count': int,
2746 'dislike_count': int,
2747 },
2748 'params': {
2749 'skip_download': True,
2750 },
2751 }, {
2752 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2753 'info_dict': {
2754 'id': 'a48o2S1cPoo',
2755 'ext': 'mp4',
2756 'title': 'The Young Turks - Live Main Show',
2757 'uploader': 'The Young Turks',
2758 'uploader_id': 'TheYoungTurks',
2759 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2760 'upload_date': '20150715',
2761 'license': 'Standard YouTube License',
2762 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2763 'categories': ['News & Politics'],
2764 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2765 'like_count': int,
2766 'dislike_count': int,
2767 },
2768 'params': {
2769 'skip_download': True,
2770 },
2771 'only_matching': True,
2772 }, {
2773 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2774 'only_matching': True,
2775 }, {
2776 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2777 'only_matching': True,
2778 },
2779 # TODO
2780 # {
2781 # 'url': 'https://www.youtube.com/TheYoungTurks/live',
2782 # 'only_matching': True,
2783 # }
2784 ]
2785
2786 def _extract_channel_id(self, webpage):
2787 channel_id = self._html_search_meta(
2788 'channelId', webpage, 'channel id', default=None)
2789 if channel_id:
2790 return channel_id
2791 channel_url = self._html_search_meta(
2792 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2793 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2794 'twitter:app:url:googleplay'), webpage, 'channel url')
2795 return self._search_regex(
2796 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2797 channel_url, 'channel id')
2798
2799 @staticmethod
2800 def _extract_grid_item_renderer(item):
2801 for item_kind in ('Playlist', 'Video', 'Channel'):
2802 renderer = item.get('grid%sRenderer' % item_kind)
2803 if renderer:
2804 return renderer
2805
2806 def _extract_video(self, renderer):
2807 video_id = renderer.get('videoId')
2808 title = try_get(
2809 renderer,
2810 (lambda x: x['title']['runs'][0]['text'],
2811 lambda x: x['title']['simpleText']), compat_str)
2812 description = try_get(
2813 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2814 compat_str)
2815 duration = parse_duration(try_get(
2816 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2817 view_count_text = try_get(
2818 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2819 view_count = str_to_int(self._search_regex(
2820 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2821 'view count', default=None))
2822 uploader = try_get(
2823 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2824 return {
2825 '_type': 'url_transparent',
2826 'ie_key': YoutubeIE.ie_key(),
2827 'id': video_id,
2828 'url': video_id,
2829 'title': title,
2830 'description': description,
2831 'duration': duration,
2832 'view_count': view_count,
2833 'uploader': uploader,
2834 }
2835
2836 def _grid_entries(self, grid_renderer):
2837 for item in grid_renderer['items']:
2838 if not isinstance(item, dict):
2839 continue
2840 renderer = self._extract_grid_item_renderer(item)
2841 if not isinstance(renderer, dict):
2842 continue
2843 title = try_get(
2844 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2845 # playlist
2846 playlist_id = renderer.get('playlistId')
2847 if playlist_id:
2848 yield self.url_result(
2849 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2850 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2851 video_title=title)
2852 # video
2853 video_id = renderer.get('videoId')
2854 if video_id:
2855 yield self._extract_video(renderer)
2856 # channel
2857 channel_id = renderer.get('channelId')
2858 if channel_id:
2859 title = try_get(
2860 renderer, lambda x: x['title']['simpleText'], compat_str)
2861 yield self.url_result(
2862 'https://www.youtube.com/channel/%s' % channel_id,
2863 ie=YoutubeTabIE.ie_key(), video_title=title)
2864
2865 def _shelf_entries_trimmed(self, shelf_renderer):
2866 renderer = try_get(
2867 shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
2868 if not renderer:
2869 return
2870 # TODO: add support for nested playlists so each shelf is processed
2871 # as separate playlist
2872 # TODO: this includes only first N items
2873 for entry in self._grid_entries(renderer):
2874 yield entry
2875
2876 def _shelf_entries(self, shelf_renderer):
2877 ep = try_get(
2878 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2879 compat_str)
2880 shelf_url = urljoin('https://www.youtube.com', ep)
2881 if not shelf_url:
2882 return
2883 title = try_get(
2884 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2885 yield self.url_result(shelf_url, video_title=title)
2886
2887 def _playlist_entries(self, video_list_renderer):
2888 for content in video_list_renderer['contents']:
2889 if not isinstance(content, dict):
2890 continue
2891 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2892 if not isinstance(renderer, dict):
2893 continue
2894 video_id = renderer.get('videoId')
2895 if not video_id:
2896 continue
2897 yield self._extract_video(renderer)
2898
2899 def _itemSection_entries(self, item_sect_renderer):
2900 for content in item_sect_renderer['contents']:
2901 if not isinstance(content, dict):
2902 continue
2903 renderer = content.get('videoRenderer', {})
2904 if not isinstance(renderer, dict):
2905 continue
2906 video_id = renderer.get('videoId')
2907 if not video_id:
2908 continue
2909 yield self._extract_video(renderer)
2910
2911 def _rich_entries(self, rich_grid_renderer):
2912 renderer = try_get(
2913 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict)
2914 video_id = renderer.get('videoId')
2915 if not video_id:
2916 return
2917 yield self._extract_video(renderer)
2918
2919 def _video_entry(self, video_renderer):
2920 video_id = video_renderer.get('videoId')
2921 if video_id:
2922 return self._extract_video(video_renderer)
2923
2924 def _post_thread_entries(self, post_thread_renderer):
2925 post_renderer = try_get(
2926 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
2927 if not post_renderer:
2928 return
2929 # video attachment
2930 video_renderer = try_get(
2931 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
2932 video_id = None
2933 if video_renderer:
2934 entry = self._video_entry(video_renderer)
2935 if entry:
2936 yield entry
2937 # inline video links
2938 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
2939 for run in runs:
2940 if not isinstance(run, dict):
2941 continue
2942 ep_url = try_get(
2943 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
2944 if not ep_url:
2945 continue
2946 if not YoutubeIE.suitable(ep_url):
2947 continue
2948 ep_video_id = YoutubeIE._match_id(ep_url)
2949 if video_id == ep_video_id:
2950 continue
2951 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
2952
2953 def _post_thread_continuation_entries(self, post_thread_continuation):
2954 contents = post_thread_continuation.get('contents')
2955 if not isinstance(contents, list):
2956 return
2957 for content in contents:
2958 renderer = content.get('backstagePostThreadRenderer')
2959 if not isinstance(renderer, dict):
2960 continue
2961 for entry in self._post_thread_entries(renderer):
2962 yield entry
2963
2964 @staticmethod
2965 def _extract_next_continuation_data(renderer):
2966 next_continuation = try_get(
2967 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
2968 if not next_continuation:
2969 return
2970 continuation = next_continuation.get('continuation')
2971 if not continuation:
2972 return
2973 ctp = next_continuation.get('clickTrackingParams')
2974 return {
2975 'ctoken': continuation,
2976 'continuation': continuation,
2977 'itct': ctp,
2978 }
2979
2980 @classmethod
2981 def _extract_continuation(cls, renderer):
2982 next_continuation = cls._extract_next_continuation_data(renderer)
2983 if next_continuation:
2984 return next_continuation
2985 contents = renderer.get('contents')
2986 if not isinstance(contents, list):
2987 return
2988 for content in contents:
2989 if not isinstance(content, dict):
2990 continue
2991 continuation_ep = try_get(
2992 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
2993 dict)
2994 if not continuation_ep:
2995 continue
2996 continuation = try_get(
2997 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
2998 if not continuation:
2999 continue
3000 ctp = continuation_ep.get('clickTrackingParams')
3001 if not ctp:
3002 continue
3003 return {
3004 'ctoken': continuation,
3005 'continuation': continuation,
3006 'itct': ctp,
3007 }
3008
    def _entries(self, tab, identity_token):
        """Yield all entries of a tab's content, following continuations.

        ``tab`` is the selected tab's ``content`` dict. The first page is
        parsed from the embedded renderer; subsequent pages are fetched
        from the browse_ajax endpoint until no continuation token is
        returned (or ``_MAX_PAGES``, if a subclass defines it, is hit).
        """

        def extract_entries(parent_renderer):
            # Dispatch every section/item of the renderer to the matching
            # _*_entries helper, recording where to continue from in
            # continuation_list[0] (a 1-element list; see note below).
            slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for slr_content in slr_contents:
                if not isinstance(slr_content, dict):
                    continue
                is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Rich grids (e.g. channel /videos) wrap each video in
                    # a richItemRenderer; their continuation lives on the
                    # parent renderer rather than on the item.
                    renderer = slr_content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        for entry in self._shelf_entries(renderer):
                            yield entry
                        # Shelves continue from the parent, not the shelf.
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry
                # Fall back to the section's own continuation, then to the
                # parent's, keeping the first one found.
                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # 1-element list used as a writable cell because Python 2 has no
        # `nonlocal` statement.
        continuation_list = [None]
        parent_renderer = (
            try_get(tab, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
        if parent_renderer:
            for entry in extract_entries(parent_renderer):
                yield entry
        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES:
                break
            browse = self._download_json(
                'https://www.youtube.com/browse_ajax', None,
                'Downloading page %d' % page_num,
                headers=headers, query=continuation, fatal=False)
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Older ("legacy") continuation format.
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')
                if continuation_renderer:
                    # A full section list: re-run the dispatcher and pick
                    # up its continuation from the shared cell.
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # Newer continuation format (onResponseReceivedActions).
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    continue
                renderer = continuation_item.get('playlistVideoRenderer')
                if renderer:
                    # Wrap the flat item list so the existing helpers can
                    # consume it like a regular renderer.
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
                renderer = continuation_item.get('itemSectionRenderer')
                if renderer:
                    for entry in self._itemSection_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    continue
            break
3142
3143 @staticmethod
3144 def _extract_selected_tab(tabs):
3145 for tab in tabs:
3146 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3147 return tab['tabRenderer']
3148 else:
3149 raise ExtractorError('Unable to find selected tab')
3150
3151 @staticmethod
3152 def _extract_uploader(data):
3153 uploader = {}
3154 sidebar_renderer = try_get(
3155 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3156 if sidebar_renderer:
3157 for item in sidebar_renderer:
3158 if not isinstance(item, dict):
3159 continue
3160 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3161 if not isinstance(renderer, dict):
3162 continue
3163 owner = try_get(
3164 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3165 if owner:
3166 uploader['uploader'] = owner.get('text')
3167 uploader['uploader_id'] = try_get(
3168 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3169 uploader['uploader_url'] = urljoin(
3170 'https://www.youtube.com/',
3171 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3172 return uploader
3173
    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Build a playlist result from a channel/playlist tab page.

        Title, description and playlist id are taken from
        channelMetadataRenderer (channel pages) or, when present,
        overridden by playlistMetadataRenderer (playlist pages). Returns
        None when neither renderer yields a playlist id.
        """
        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        playlist_id = None
        if renderer:
            channel_title = renderer.get('title') or item_id
            tab_title = selected_tab.get('title')
            title = channel_title or item_id
            # e.g. "lex will - Videos"
            if tab_title:
                title += ' - %s' % tab_title
            description = renderer.get('description')
            playlist_id = renderer.get('externalId')
        # Playlist metadata wins over channel metadata when both exist.
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
        if renderer:
            title = renderer.get('title')
            description = None
            playlist_id = item_id
        if playlist_id is None:
            return None
        playlist = self.playlist_result(
            self._entries(selected_tab['content'], identity_token),
            playlist_id=playlist_id, playlist_title=title,
            playlist_description=description)
        # Merge uploader name/id/url from the sidebar, when available.
        playlist.update(self._extract_uploader(data))
        return playlist
3201
3202 def _extract_from_playlist(self, item_id, data, playlist):
3203 title = playlist.get('title') or try_get(
3204 data, lambda x: x['titleText']['simpleText'], compat_str)
3205 playlist_id = playlist.get('playlistId') or item_id
3206 return self.playlist_result(
3207 self._playlist_entries(playlist), playlist_id=playlist_id,
3208 playlist_title=title)
3209
    def _real_extract(self, url):
        """Dispatch a channel/user/playlist/watch URL to the right path:
        tabbed page -> _extract_from_tabs, watch page with playlist ->
        _extract_from_playlist, otherwise fall back to single-video
        extraction via YoutubeIE."""
        item_id = self._match_id(url)
        # Normalize the host so regexes and requests behave consistently.
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # A bare channel/user URL (no sub-path) defaults to its /videos tab.
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None:
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/home" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        if video_id and playlist_id:
            # --no-playlist short-circuits to the single video.
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        # Needed to page through results for logged-in accounts.
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3252
3253
3254 class YoutubePlaylistIE(InfoExtractor):
3255 IE_DESC = 'YouTube.com playlists'
3256 _VALID_URL = r'''(?x)(?:
3257 (?:https?://)?
3258 (?:\w+\.)?
3259 (?:
3260 (?:
3261 youtube(?:kids)?\.com|
3262 invidio\.us|
3263 youtu\.be
3264 )
3265 /.*?\?.*?\blist=
3266 )?
3267 (?P<id>%(playlist_id)s)
3268 )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
3269 IE_NAME = 'youtube:playlist'
3270 _TESTS = [{
3271 'note': 'issue #673',
3272 'url': 'PLBB231211A4F62143',
3273 'info_dict': {
3274 'title': '[OLD]Team Fortress 2 (Class-based LP)',
3275 'id': 'PLBB231211A4F62143',
3276 'uploader': 'Wickydoo',
3277 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
3278 },
3279 'playlist_mincount': 29,
3280 }, {
3281 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
3282 'info_dict': {
3283 'title': 'YDL_safe_search',
3284 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
3285 },
3286 'playlist_count': 2,
3287 'skip': 'This playlist is private',
3288 }, {
3289 'note': 'embedded',
3290 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
3291 'playlist_count': 4,
3292 'info_dict': {
3293 'title': 'JODA15',
3294 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
3295 'uploader': 'milan',
3296 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
3297 }
3298 }, {
3299 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
3300 'playlist_mincount': 982,
3301 'info_dict': {
3302 'title': '2018 Chinese New Singles (11/6 updated)',
3303 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
3304 'uploader': 'LBK',
3305 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
3306 }
3307 }, {
3308 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
3309 'info_dict': {
3310 'id': 'yeWKywCrFtk',
3311 'ext': 'mp4',
3312 'title': 'Small Scale Baler and Braiding Rugs',
3313 'uploader': 'Backus-Page House Museum',
3314 'uploader_id': 'backuspagemuseum',
3315 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
3316 'upload_date': '20161008',
3317 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
3318 'categories': ['Nonprofits & Activism'],
3319 'tags': list,
3320 'like_count': int,
3321 'dislike_count': int,
3322 },
3323 'params': {
3324 'noplaylist': True,
3325 'skip_download': True,
3326 },
3327 }, {
3328 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
3329 'only_matching': True,
3330 }, {
3331 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
3332 'only_matching': True,
3333 }, {
3334 # music album playlist
3335 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
3336 'only_matching': True,
3337 }]
3338
3339 @classmethod
3340 def suitable(cls, url):
3341 return False if YoutubeTabIE.suitable(url) else super(
3342 YoutubePlaylistIE, cls).suitable(url)
3343
3344 def _real_extract(self, url):
3345 playlist_id = self._match_id(url)
3346 qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
3347 if not qs:
3348 qs = {'list': playlist_id}
3349 return self.url_result(
3350 update_url_query('https://www.youtube.com/playlist', qs),
3351 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3352
3353
class YoutubeYtUserIE(InfoExtractor):
    """Resolves the internal 'ytuser:<name>' scheme to the corresponding
    /user/ channel URL and hands it to YoutubeTabIE."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3366
3367
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backing the 'ytsearch<N>:<query>' pseudo-URL scheme."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra encoded filter/sort params sent with the query; subclasses override.
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* results for *query* as url_transparent entries,
        paging through YouTube's InnerTube search API via continuation tokens."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            # fatal=False above: a failed/empty response ends pagination quietly.
            if not search:
                break
            # The first page nests results under sectionListRenderer;
            # continuation pages deliver them via onResponseReceivedCommands.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Non-video items (channels, playlists, ads, ...) have no
                # videoRenderer and are skipped.
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Drop all whitespace, then take the leading run of digits.
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
            # n may be float('inf'), in which case this never triggers and
            # pagination only stops when the continuation token runs out.
            if total == n:
                return
            # Token for the next page; absent on the last page of results.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3456
3457
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant ('ytsearchdate') that orders results newest-first."""
    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Encoded InnerTube filter selecting sort-by-upload-date.
    _SEARCH_PARAMS = 'CAI%3D'
3463
3464
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Handles full youtube.com/results?search_query=... URLs by reusing the
    paged search machinery of YoutubeSearchIE."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # URLs are matched directly; no 'ytsearch'-style key prefix needed.
        return cls._VALID_URL

    def _real_extract(self, url):
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        # _VALID_URL guarantees at least one of these keys is present.
        search_terms = params.get('search_query') or params.get('q')
        # Pass any 'sp' filter parameter straight through to the API.
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(search_terms[0], self._MAX_RESULTS)
3490
3491
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derived dynamically so subclasses only need to set _FEED_NAME.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account; authenticate before extraction.
        self._login()

    def _shelf_entries(self, shelf_renderer):
        """Yield entries from a grid-style shelf; other shelf layouts are skipped."""
        grid = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
        if not grid:
            return
        for entry in self._grid_entries(grid):
            yield entry

    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Build the playlist result from the currently selected feed tab."""
        active_tab = self._extract_selected_tab(tabs)
        entries = self._entries(active_tab['content'], identity_token)
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)

    def _real_extract(self, url):
        feed_id = self._FEED_NAME
        # The incoming URL may be a shortcut (e.g. ':ythistory'); always fetch
        # the canonical feed page instead.
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        webpage = self._download_webpage(feed_url, feed_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(feed_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if not tabs:
            # Failed to recognize
            raise ExtractorError('Unable to recognize feed page')
        return self._extract_from_tabs(feed_id, webpage, data, tabs, identity_token)
3535
3536
class YoutubeWatchLaterIE(InfoExtractor):
    """Maps the watch-later shortcuts onto the special 'WL' playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3553
3554
class YoutubeFavouritesIE(InfoExtractor):
    """Maps the ':ytfav' shortcuts onto the special 'LL' (liked videos) playlist."""
    IE_NAME = 'youtube:favourites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?|LL'

    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3568
3569
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed (also matches the bare youtube.com home page)."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
3575
3576
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Subscriptions feed at /feed/subscriptions (also ':ytsubs' shortcuts)."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
3582
3583
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Watch-history feed at /feed/history (also the ':ythistory' shortcut)."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
3589
3590
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch/attribution URLs whose video ID is missing — typically
    because an unquoted '&' was eaten by the user's shell — and raises a
    helpful, expected error instead of failing obscurely."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose regex: a watch URL with only a non-'v' parameter (or none at
    # all), or an attribution_link without its 'u' parameter. '$' ensures we
    # only claim URLs that truly carry no video ID.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error: there is nothing to extract from a truncated URL.
        # expected=True suppresses the bug-report boilerplate.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3638
3639
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catches watch URLs whose 11-character video ID was cut short and
    raises a clear, expected error instead of a confusing download failure."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # 1-10 ID characters (a full ID has 11), anchored so longer IDs don't match.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
3655
3656
# Do YouTube show URLs even exist anymore? I couldn't find any.
3658 r'''
3659 class YoutubeShowIE(YoutubeTabIE):
3660 IE_DESC = 'YouTube.com (multi-season) shows'
3661 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3662 IE_NAME = 'youtube:show'
3663 _TESTS = [{
3664 'url': 'https://www.youtube.com/show/airdisasters',
3665 'playlist_mincount': 5,
3666 'info_dict': {
3667 'id': 'airdisasters',
3668 'title': 'Air Disasters',
3669 }
3670 }]
3671
3672 def _real_extract(self, url):
3673 playlist_id = self._match_id(url)
3674 return super(YoutubeShowIE, self)._real_extract(
3675 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3676 '''