]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dlc/extractor/youtube.py
Minor changes to make it easier to merge
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27)
28from ..utils import (
29 bool_or_none,
30 clean_html,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 parse_codecs,
38 parse_count,
39 parse_duration,
40 remove_quotes,
41 remove_start,
42 smuggle_url,
43 str_or_none,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 update_url_query,
50 uppercase_escape,
51 url_or_none,
52 urlencode_postdata,
53 urljoin,
54)
55
56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Handles account login (including two-factor auth), the English-language
    PREF cookie, and calls to the youtubei ("InnerTube") JSON API that all
    concrete YouTube extractors share.
    """
    # Google account sign-in endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # "sl" (service login) JSON endpoints; responses are positional JSON
    # arrays prefixed with an XSSI guard that req() strips off
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is the "TL" token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path segments that can never be a user/channel name; used by subclass
    # URL regexes to avoid matching e.g. youtube.com/feed/... as a channel
    _RESERVED_NAMES = (
        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID prefixes, plus the special keyword lists
    # (WL = watch later, LL/LM = liked, RDMM = personal mix)
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    # Extra headers identifying us as the desktop web client
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Force an English UI (hl=en) so that text-based extraction
        # (dates, error messages) sees predictable strings
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap plain video IDs into url_result dicts delegating to YoutubeIE
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        NOTE: the flow below talks to Google's undocumented "GlifWebSignIn"
        endpoints; requests and responses are positional JSON arrays, so the
        magic indices used with try_get() mirror the observed wire format.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # NOTE(review): 'and False' makes this branch dead code on purpose
            # (see inline TODO) - the reminder is currently disabled
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs carry session tokens that must be echoed back
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the positional-JSON payload f_req to a sign-in endpoint
            # and return the parsed (XSSI-prefix-stripped) JSON response,
            # or False on failure (fatal=False)
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses start with an XSSI guard (e.g. ")]}'"); drop
                # everything before the first '[' so json parsing succeeds
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account by username to obtain a user hash
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password for the looked-up user hash
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry at [0][5] signals a login error (e.g. bad password)
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Step 3 (optional): handle an additional challenge, e.g. TFA
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token is required to address the TFA endpoint
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users sometimes paste the SMS-style "G-123456" form
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges we cannot solve programmatically; tell the user
                # to resolve them in a browser
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Step 4: fetch the CheckCookie URL to finalize the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through myaccount.google.com
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Copy the query dict so downstream mutation cannot leak back into
        # the caller's dictionary
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        # Pull the ytInitialData JSON blob out of the watch page; the
        # lookbehind anchors the non-greedy match at a closing brace
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        # NOTE(review): both the success and failure paths end here, so the
        # conditional return is effectively a no-op (login failure only warns)
        if not self._login():
            return

    # Minimal InnerTube request context identifying the desktop web client
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # ytInitialData assignment in either window["..."] or bare-var form
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'

    def _call_api(self, ep, query, video_id):
        """Call the youtubei/v1 InnerTube endpoint *ep* and return its JSON.

        *query* is merged over _DEFAULT_API_DATA to form the POST body.
        The 'key' parameter is the public web-client API key, not a secret.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        # Fatal variant of _get_yt_initial_data: raises if the blob is
        # missing. Tries the newline-terminated form first.
        return self._parse_json(
            self._search_regex(
                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
330
331
332class YoutubeIE(YoutubeBaseInfoExtractor):
333 IE_DESC = 'YouTube.com'
334 _VALID_URL = r"""(?x)^
335 (
336 (?:https?://|//) # http(s):// or protocol-independent URL
337 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
338 (?:www\.)?deturl\.com/www\.youtube\.com/|
339 (?:www\.)?pwnyoutube\.com/|
340 (?:www\.)?hooktube\.com/|
341 (?:www\.)?yourepeat\.com/|
342 tube\.majestyc\.net/|
343 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
344 (?:(?:www|dev)\.)?invidio\.us/|
345 (?:(?:www|no)\.)?invidiou\.sh/|
346 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
347 (?:www\.)?invidious\.kabi\.tk/|
348 (?:www\.)?invidious\.13ad\.de/|
349 (?:www\.)?invidious\.mastodon\.host/|
350 (?:www\.)?invidious\.nixnet\.xyz/|
351 (?:www\.)?invidious\.drycat\.fr/|
352 (?:www\.)?tube\.poal\.co/|
353 (?:www\.)?vid\.wxzm\.sx/|
354 (?:www\.)?yewtu\.be/|
355 (?:www\.)?yt\.elukerio\.org/|
356 (?:www\.)?yt\.lelux\.fi/|
357 (?:www\.)?invidious\.ggc-project\.de/|
358 (?:www\.)?yt\.maisputain\.ovh/|
359 (?:www\.)?invidious\.13ad\.de/|
360 (?:www\.)?invidious\.toot\.koeln/|
361 (?:www\.)?invidious\.fdn\.fr/|
362 (?:www\.)?watch\.nettohikari\.com/|
363 (?:www\.)?kgg2m7yk5aybusll\.onion/|
364 (?:www\.)?qklhadlycap4cnod\.onion/|
365 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
366 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
367 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
368 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
369 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
370 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
371 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
372 (?:.*?\#/)? # handle anchor (#/) redirect urls
373 (?: # the various things that can precede the ID:
374 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
375 |(?: # or the v= param in all its forms
376 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
377 (?:\?|\#!?) # the params delimiter ? or # or #!
378 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
379 v=
380 )
381 ))
382 |(?:
383 youtu\.be| # just youtu.be/xxxx
384 vid\.plus| # or vid.plus/xxxx
385 zwearz\.com/watch| # or zwearz.com/watch/xxxx
386 )/
387 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
388 )
389 )? # all until now is optional -> you can pass the naked ID
390 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
391 (?!.*?\blist=
392 (?:
393 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
394 WL # WL are handled by the watch later IE
395 )
396 )
397 (?(1).+)? # if we found the ID, everything can follow
398 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
399 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
400 _PLAYER_INFO_RE = (
401 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
402 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
403 )
404 _formats = {
405 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
406 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
407 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
408 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
409 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
410 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
412 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
413 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
414 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
415 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
416 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
417 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
418 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
419 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
420 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
421 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
422 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
423
424
425 # 3D videos
426 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
427 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
428 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
429 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
430 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
431 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
432 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
433
434 # Apple HTTP Live Streaming
435 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
436 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
437 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
438 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
439 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
440 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
441 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
442 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
443
444 # DASH mp4 video
445 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
449 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
450 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
451 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
453 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
454 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
455 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
456 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
457
458 # Dash mp4 audio
459 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
460 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
461 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
462 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
463 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
464 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
465 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
466
467 # Dash webm
468 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
473 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
474 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
475 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
482 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
484 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
485 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
487 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
490
491 # Dash webm audio
492 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
493 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
494
495 # Dash webm audio with opus inside
496 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
497 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
498 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
499
500 # RTMP (unnamed)
501 '_rtmp': {'protocol': 'rtmp'},
502
503 # av01 video only formats sometimes served with "unknown" codecs
504 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
507 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
508 }
509 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
510
511 _GEO_BYPASS = False
512
513 IE_NAME = 'youtube'
514 _TESTS = [
515 {
516 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
517 'info_dict': {
518 'id': 'BaW_jenozKc',
519 'ext': 'mp4',
520 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
521 'uploader': 'Philipp Hagemeister',
522 'uploader_id': 'phihag',
523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
524 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
525 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
526 'upload_date': '20121002',
527 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
528 'categories': ['Science & Technology'],
529 'tags': ['youtube-dl'],
530 'duration': 10,
531 'view_count': int,
532 'like_count': int,
533 'dislike_count': int,
534 'start_time': 1,
535 'end_time': 9,
536 }
537 },
538 {
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
548 'uploader_id': 'setindia',
549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
550 'age_limit': 18,
551 }
552 },
553 {
554 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
555 'note': 'Use the first video ID in the URL',
556 'info_dict': {
557 'id': 'BaW_jenozKc',
558 'ext': 'mp4',
559 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
560 'uploader': 'Philipp Hagemeister',
561 'uploader_id': 'phihag',
562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
563 'upload_date': '20121002',
564 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
565 'categories': ['Science & Technology'],
566 'tags': ['youtube-dl'],
567 'duration': 10,
568 'view_count': int,
569 'like_count': int,
570 'dislike_count': int,
571 },
572 'params': {
573 'skip_download': True,
574 },
575 },
576 {
577 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
578 'note': '256k DASH audio (format 141) via DASH manifest',
579 'info_dict': {
580 'id': 'a9LDPn-MO4I',
581 'ext': 'm4a',
582 'upload_date': '20121002',
583 'uploader_id': '8KVIDEO',
584 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
585 'description': '',
586 'uploader': '8KVIDEO',
587 'title': 'UHDTV TEST 8K VIDEO.mp4'
588 },
589 'params': {
590 'youtube_include_dash_manifest': True,
591 'format': '141',
592 },
593 'skip': 'format 141 not served anymore',
594 },
595 # DASH manifest with encrypted signature
596 {
597 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
598 'info_dict': {
599 'id': 'IB3lcPjvWLA',
600 'ext': 'm4a',
601 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
602 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
603 'duration': 244,
604 'uploader': 'AfrojackVEVO',
605 'uploader_id': 'AfrojackVEVO',
606 'upload_date': '20131011',
607 },
608 'params': {
609 'youtube_include_dash_manifest': True,
610 'format': '141/bestaudio[ext=m4a]',
611 },
612 },
613 # Controversy video
614 {
615 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
616 'info_dict': {
617 'id': 'T4XJQO3qol8',
618 'ext': 'mp4',
619 'duration': 219,
620 'upload_date': '20100909',
621 'uploader': 'Amazing Atheist',
622 'uploader_id': 'TheAmazingAtheist',
623 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
624 'title': 'Burning Everyone\'s Koran',
625 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
626 }
627 },
628 # Normal age-gate video (embed allowed)
629 {
630 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
631 'info_dict': {
632 'id': 'HtVdAasjOgU',
633 'ext': 'mp4',
634 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
635 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
636 'duration': 142,
637 'uploader': 'The Witcher',
638 'uploader_id': 'WitcherGame',
639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
640 'upload_date': '20140605',
641 'age_limit': 18,
642 },
643 },
644 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
645 # YouTube Red ad is not captured for creator
646 {
647 'url': '__2ABJjxzNo',
648 'info_dict': {
649 'id': '__2ABJjxzNo',
650 'ext': 'mp4',
651 'duration': 266,
652 'upload_date': '20100430',
653 'uploader_id': 'deadmau5',
654 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
655 'creator': 'Dada Life, deadmau5',
656 'description': 'md5:12c56784b8032162bb936a5f76d55360',
657 'uploader': 'deadmau5',
658 'title': 'Deadmau5 - Some Chords (HD)',
659 'alt_title': 'This Machine Kills Some Chords',
660 },
661 'expected_warnings': [
662 'DASH manifest missing',
663 ]
664 },
665 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
666 {
667 'url': 'lqQg6PlCWgI',
668 'info_dict': {
669 'id': 'lqQg6PlCWgI',
670 'ext': 'mp4',
671 'duration': 6085,
672 'upload_date': '20150827',
673 'uploader_id': 'olympic',
674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
675 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
676 'uploader': 'Olympic',
677 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
678 },
679 'params': {
680 'skip_download': 'requires avconv',
681 }
682 },
683 # Non-square pixels
684 {
685 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
686 'info_dict': {
687 'id': '_b-2C3KPAM0',
688 'ext': 'mp4',
689 'stretched_ratio': 16 / 9.,
690 'duration': 85,
691 'upload_date': '20110310',
692 'uploader_id': 'AllenMeow',
693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
694 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
695 'uploader': '孫ᄋᄅ',
696 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
697 },
698 },
699 # url_encoded_fmt_stream_map is empty string
700 {
701 'url': 'qEJwOuvDf7I',
702 'info_dict': {
703 'id': 'qEJwOuvDf7I',
704 'ext': 'webm',
705 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
706 'description': '',
707 'upload_date': '20150404',
708 'uploader_id': 'spbelect',
709 'uploader': 'Наблюдатели Петербурга',
710 },
711 'params': {
712 'skip_download': 'requires avconv',
713 },
714 'skip': 'This live event has ended.',
715 },
716 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
717 {
718 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
719 'info_dict': {
720 'id': 'FIl7x6_3R5Y',
721 'ext': 'webm',
722 'title': 'md5:7b81415841e02ecd4313668cde88737a',
723 'description': 'md5:116377fd2963b81ec4ce64b542173306',
724 'duration': 220,
725 'upload_date': '20150625',
726 'uploader_id': 'dorappi2000',
727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
728 'uploader': 'dorappi2000',
729 'formats': 'mincount:31',
730 },
731 'skip': 'not actual anymore',
732 },
733 # DASH manifest with segment_list
734 {
735 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
736 'md5': '8ce563a1d667b599d21064e982ab9e31',
737 'info_dict': {
738 'id': 'CsmdDsKjzN8',
739 'ext': 'mp4',
740 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
741 'uploader': 'Airtek',
742 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
743 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
744 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
745 },
746 'params': {
747 'youtube_include_dash_manifest': True,
748 'format': '135', # bestvideo
749 },
750 'skip': 'This live event has ended.',
751 },
752 {
753 # Multifeed videos (multiple cameras), URL is for Main Camera
754 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'title': 'teamPGP: Rocket League Noob Stream',
758 'description': 'md5:dc7872fb300e143831327f1bae3af010',
759 },
760 'playlist': [{
761 'info_dict': {
762 'id': 'jqWvoWXjCVs',
763 'ext': 'mp4',
764 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 'duration': 7335,
767 'upload_date': '20150721',
768 'uploader': 'Beer Games Beer',
769 'uploader_id': 'beergamesbeer',
770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
771 'license': 'Standard YouTube License',
772 },
773 }, {
774 'info_dict': {
775 'id': '6h8e8xoXJzg',
776 'ext': 'mp4',
777 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
778 'description': 'md5:dc7872fb300e143831327f1bae3af010',
779 'duration': 7337,
780 'upload_date': '20150721',
781 'uploader': 'Beer Games Beer',
782 'uploader_id': 'beergamesbeer',
783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
784 'license': 'Standard YouTube License',
785 },
786 }, {
787 'info_dict': {
788 'id': 'PUOgX5z9xZw',
789 'ext': 'mp4',
790 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
792 'duration': 7337,
793 'upload_date': '20150721',
794 'uploader': 'Beer Games Beer',
795 'uploader_id': 'beergamesbeer',
796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
797 'license': 'Standard YouTube License',
798 },
799 }, {
800 'info_dict': {
801 'id': 'teuwxikvS5k',
802 'ext': 'mp4',
803 'title': 'teamPGP: Rocket League Noob Stream (zim)',
804 'description': 'md5:dc7872fb300e143831327f1bae3af010',
805 'duration': 7334,
806 'upload_date': '20150721',
807 'uploader': 'Beer Games Beer',
808 'uploader_id': 'beergamesbeer',
809 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
810 'license': 'Standard YouTube License',
811 },
812 }],
813 'params': {
814 'skip_download': True,
815 },
816 'skip': 'This video is not available.',
817 },
818 {
819 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
820 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
821 'info_dict': {
822 'id': 'gVfLd0zydlo',
823 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
824 },
825 'playlist_count': 2,
826 'skip': 'Not multifeed anymore',
827 },
828 {
829 'url': 'https://vid.plus/FlRa-iH7PGw',
830 'only_matching': True,
831 },
832 {
833 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
834 'only_matching': True,
835 },
836 {
837 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
838 # Also tests cut-off URL expansion in video description (see
839 # https://github.com/ytdl-org/youtube-dl/issues/1892,
840 # https://github.com/ytdl-org/youtube-dl/issues/8164)
841 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
842 'info_dict': {
843 'id': 'lsguqyKfVQg',
844 'ext': 'mp4',
845 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
846 'alt_title': 'Dark Walk - Position Music',
847 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
848 'duration': 133,
849 'upload_date': '20151119',
850 'uploader_id': 'IronSoulElf',
851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
852 'uploader': 'IronSoulElf',
853 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'track': 'Dark Walk - Position Music',
855 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
856 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
857 },
858 'params': {
859 'skip_download': True,
860 },
861 },
862 {
863 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
864 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
865 'only_matching': True,
866 },
867 {
868 # Video with yt:stretch=17:0
869 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
870 'info_dict': {
871 'id': 'Q39EVAstoRM',
872 'ext': 'mp4',
873 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
874 'description': 'md5:ee18a25c350637c8faff806845bddee9',
875 'upload_date': '20151107',
876 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
877 'uploader': 'CH GAMER DROID',
878 },
879 'params': {
880 'skip_download': True,
881 },
882 'skip': 'This video does not exist.',
883 },
884 {
885 # Video licensed under Creative Commons
886 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
887 'info_dict': {
888 'id': 'M4gD1WSo5mA',
889 'ext': 'mp4',
890 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
891 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
892 'duration': 721,
893 'upload_date': '20150127',
894 'uploader_id': 'BerkmanCenter',
895 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
896 'uploader': 'The Berkman Klein Center for Internet & Society',
897 'license': 'Creative Commons Attribution license (reuse allowed)',
898 },
899 'params': {
900 'skip_download': True,
901 },
902 },
903 {
904 # Channel-like uploader_url
905 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
906 'info_dict': {
907 'id': 'eQcmzGIKrzg',
908 'ext': 'mp4',
909 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
910 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
911 'duration': 4060,
912 'upload_date': '20151119',
913 'uploader': 'Bernie Sanders',
914 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
916 'license': 'Creative Commons Attribution license (reuse allowed)',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
922 {
923 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
924 'only_matching': True,
925 },
926 {
927 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
928 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
929 'only_matching': True,
930 },
931 {
932 # Rental video preview
933 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
934 'info_dict': {
935 'id': 'uGpuVWrhIzE',
936 'ext': 'mp4',
937 'title': 'Piku - Trailer',
938 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
939 'upload_date': '20150811',
940 'uploader': 'FlixMatrix',
941 'uploader_id': 'FlixMatrixKaravan',
942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
943 'license': 'Standard YouTube License',
944 },
945 'params': {
946 'skip_download': True,
947 },
948 'skip': 'This video is not available.',
949 },
950 {
951 # YouTube Red video with episode data
952 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
953 'info_dict': {
954 'id': 'iqKdEhx-dD4',
955 'ext': 'mp4',
956 'title': 'Isolation - Mind Field (Ep 1)',
957 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
958 'duration': 2085,
959 'upload_date': '20170118',
960 'uploader': 'Vsauce',
961 'uploader_id': 'Vsauce',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
963 'series': 'Mind Field',
964 'season_number': 1,
965 'episode_number': 1,
966 },
967 'params': {
968 'skip_download': True,
969 },
970 'expected_warnings': [
971 'Skipping DASH manifest',
972 ],
973 },
974 {
975 # The following content has been identified by the YouTube community
976 # as inappropriate or offensive to some audiences.
977 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
978 'info_dict': {
979 'id': '6SJNVb0GnPI',
980 'ext': 'mp4',
981 'title': 'Race Differences in Intelligence',
982 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
983 'duration': 965,
984 'upload_date': '20140124',
985 'uploader': 'New Century Foundation',
986 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
988 },
989 'params': {
990 'skip_download': True,
991 },
992 },
993 {
994 # itag 212
995 'url': '1t24XAntNCY',
996 'only_matching': True,
997 },
998 {
999 # geo restricted to JP
1000 'url': 'sJL6WA-aGkQ',
1001 'only_matching': True,
1002 },
1003 {
1004 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1005 'only_matching': True,
1006 },
1007 {
1008 # DRM protected
1009 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1010 'only_matching': True,
1011 },
1012 {
1013 # Video with unsupported adaptive stream type formats
1014 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1015 'info_dict': {
1016 'id': 'Z4Vy8R84T1U',
1017 'ext': 'mp4',
1018 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1019 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1020 'duration': 433,
1021 'upload_date': '20130923',
1022 'uploader': 'Amelia Putri Harwita',
1023 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1025 'formats': 'maxcount:10',
1026 },
1027 'params': {
1028 'skip_download': True,
1029 'youtube_include_dash_manifest': False,
1030 },
1031 'skip': 'not actual anymore',
1032 },
1033 {
1034 # Youtube Music Auto-generated description
1035 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1036 'info_dict': {
1037 'id': 'MgNrAu2pzNs',
1038 'ext': 'mp4',
1039 'title': 'Voyeur Girl',
1040 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1041 'upload_date': '20190312',
1042 'uploader': 'Stephen - Topic',
1043 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1044 'artist': 'Stephen',
1045 'track': 'Voyeur Girl',
1046 'album': 'it\'s too much love to know my dear',
1047 'release_date': '20190313',
1048 'release_year': 2019,
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 },
1054 {
1055 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1056 'only_matching': True,
1057 },
1058 {
1059 # invalid -> valid video id redirection
1060 'url': 'DJztXj2GPfl',
1061 'info_dict': {
1062 'id': 'DJztXj2GPfk',
1063 'ext': 'mp4',
1064 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1065 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1066 'upload_date': '20090125',
1067 'uploader': 'Prochorowka',
1068 'uploader_id': 'Prochorowka',
1069 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1070 'artist': 'Panjabi MC',
1071 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1072 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1073 },
1074 'params': {
1075 'skip_download': True,
1076 },
1077 },
1078 {
1079 # empty description results in an empty string
1080 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1081 'info_dict': {
1082 'id': 'x41yOUIvK2k',
1083 'ext': 'mp4',
1084 'title': 'IMG 3456',
1085 'description': '',
1086 'upload_date': '20170613',
1087 'uploader_id': 'ElevageOrVert',
1088 'uploader': 'ElevageOrVert',
1089 },
1090 'params': {
1091 'skip_download': True,
1092 },
1093 },
1094 {
1095 # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
1096 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1097 'info_dict': {
1098 'id': 'CHqg6qOn4no',
1099 'ext': 'mp4',
1100 'title': 'Part 77 Sort a list of simple types in c#',
1101 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1102 'upload_date': '20130831',
1103 'uploader_id': 'kudvenkat',
1104 'uploader': 'kudvenkat',
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 ]
1111
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature-decryption functions,
        # keyed by (player_url, signature cache id) -- see _decrypt_signature().
        self._player_cache = {}
1115
1116 def report_video_info_webpage_download(self, video_id):
1117 """Report attempt to download video info webpage."""
1118 self.to_screen('%s: Downloading video info webpage' % video_id)
1119
1120 def report_information_extraction(self, video_id):
1121 """Report attempt to extract video information."""
1122 self.to_screen('%s: Extracting video information' % video_id)
1123
1124 def report_unavailable_format(self, video_id, format):
1125 """Report extracted video URL."""
1126 self.to_screen('%s: Format %s not available' % (video_id, format))
1127
1128 def report_rtmp_download(self):
1129 """Indicate the download will use the RTMP protocol."""
1130 self.to_screen('RTMP download detected')
1131
1132 def _signature_cache_id(self, example_sig):
1133 """ Return a string representation of a signature """
1134 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1135
1136 @classmethod
1137 def _extract_player_info(cls, player_url):
1138 for player_re in cls._PLAYER_INFO_RE:
1139 id_m = re.search(player_re, player_url)
1140 if id_m:
1141 break
1142 else:
1143 raise ExtractorError('Cannot identify player %r' % player_url)
1144 return id_m.group('ext'), id_m.group('id')
1145
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player and extract its signature-decryption function.

        Returns a callable (str -> str). The derived character permutation is
        persisted in the filesystem cache so subsequent runs can skip the
        player download entirely.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id is used as a cache filename; it must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: decryption is a pure
            # character permutation, so it can be replayed without the player
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of unique characters
        # to record the permutation it applies, then cache that permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1185
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function
        (used with the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            # Compresses runs of consecutive indices (step +/-1) into slice
            # expressions, emitting single-element lookups otherwise.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or slice run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Record the permutation the function applies to a probe string
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1224
    def _parse_sig_js(self, jscode):
        """Locate the signature-decryption function inside the player JS and
        return a Python callable (str -> str) implementing it."""
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single argument
        return lambda s: initial_function([s])
1245
1246 def _parse_sig_swf(self, file_contents):
1247 swfi = SWFInterpreter(file_contents)
1248 TARGET_CLASSNAME = 'SignatureDecipher'
1249 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1250 initial_function = swfi.extract_function(searched_class, 'decipher')
1251 return lambda s: initial_function([s])
1252
1253 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1254 """Turn the encrypted s field into a working signature"""
1255
1256 if player_url is None:
1257 raise ExtractorError('Cannot decrypt signature without player_url')
1258
1259 if player_url.startswith('//'):
1260 player_url = 'https:' + player_url
1261 elif not re.match(r'https?://', player_url):
1262 player_url = compat_urlparse.urljoin(
1263 'https://www.youtube.com', player_url)
1264 try:
1265 player_id = (player_url, self._signature_cache_id(s))
1266 if player_id not in self._player_cache:
1267 func = self._extract_signature_function(
1268 video_id, player_url, s
1269 )
1270 self._player_cache[player_id] = func
1271 func = self._player_cache[player_id]
1272 if self._downloader.params.get('youtube_print_sig_code'):
1273 self._print_sig_code(func, s)
1274 return func(s)
1275 except Exception as e:
1276 tb = traceback.format_exc()
1277 raise ExtractorError(
1278 'Signature extraction failed: ' + tb, cause=e)
1279
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Return manually-created subtitle tracks for video_id.

        Queries the legacy timedtext listing endpoint and builds a dict
        mapping language code -> list of {'url', 'ext'} entries; a synthetic
        'live_chat' entry is added when a live chat replay is available.
        Returns an empty dict on failure.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track per language
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            # Pseudo-subtitle entry handled by the youtube_live_chat_replay
            # downloader protocol rather than a direct URL
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1319
1320 def _get_ytplayer_config(self, video_id, webpage):
1321 patterns = (
1322 # User data may contain arbitrary character sequences that may affect
1323 # JSON extraction with regex, e.g. when '};' is contained the second
1324 # regex won't capture the whole JSON. Yet working around by trying more
1325 # concrete regex first keeping in mind proper quoted string handling
1326 # to be implemented in future that will replace this workaround (see
1327 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1328 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1329 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1330 r';ytplayer\.config\s*=\s*({.+?});',
1331 )
1332 config = self._search_regex(
1333 patterns, webpage, 'ytplayer.config', default=None)
1334 if config:
1335 return self._parse_json(
1336 uppercase_escape(config), video_id, fatal=False)
1337
1338 def _get_music_metadata_from_yt_initial(self, yt_initial):
1339 music_metadata = []
1340 key_map = {
1341 'Album': 'album',
1342 'Artist': 'artist',
1343 'Song': 'track'
1344 }
1345 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1346 if type(contents) is list:
1347 for content in contents:
1348 music_track = {}
1349 if type(content) is not dict:
1350 continue
1351 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1352 if type(videoSecondaryInfoRenderer) is not dict:
1353 continue
1354 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1355 if type(rows) is not list:
1356 continue
1357 for row in rows:
1358 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1359 if type(metadataRowRenderer) is not dict:
1360 continue
1361 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1362 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1363 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1364 if type(key) is not str or type(value) is not str:
1365 continue
1366 if key in key_map:
1367 if key_map[key] in music_track:
1368 # we've started on a new track
1369 music_metadata.append(music_track)
1370 music_track = {}
1371 music_track[key_map[key]] = value
1372 if len(music_track.keys()):
1373 music_metadata.append(music_track)
1374 return music_metadata
1375
    def _get_automatic_captions(self, video_id, webpage):
        """Extract automatically-generated caption tracks for video_id.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> list of {'url', 'ext'}
        entries; an empty dict when no automatic captions are found.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            # Legacy flow: ttsurl-based caption listing
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language caption URLs by rewriting the query
                # string of a single base caption URL
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1477
1478 def _mark_watched(self, video_id, video_info, player_response):
1479 playback_url = url_or_none(try_get(
1480 player_response,
1481 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1482 video_info, lambda x: x['videostats_playback_base_url'][0]))
1483 if not playback_url:
1484 return
1485 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1486 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1487
1488 # cpn generation algorithm is reverse engineered from base.js.
1489 # In fact it works even with dummy cpn.
1490 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1491 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1492
1493 qs.update({
1494 'ver': ['2'],
1495 'cpn': [cpn],
1496 })
1497 playback_url = compat_urlparse.urlunparse(
1498 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1499
1500 self._download_webpage(
1501 playback_url, video_id, 'Marking watched',
1502 'Unable to mark watched', fatal=False)
1503
    @staticmethod
    def _extract_urls(webpage):
        """Return all YouTube embed URLs/ids found in webpage (may be empty)."""
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
                (?:
                    <iframe[^>]+?src=|
                    data-video-url=|
                    <embed[^>]+?src=|
                    embedSWF\(?:\s*|
                    <object[^>]+data=|
                    new\s+SWFObject\(
                )
                (["\'])
                    (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
                    (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
                \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1535
1536 @staticmethod
1537 def _extract_url(webpage):
1538 urls = YoutubeIE._extract_urls(webpage)
1539 return urls[0] if urls else None
1540
1541 @classmethod
1542 def extract_id(cls, url):
1543 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1544 if mobj is None:
1545 raise ExtractorError('Invalid URL: %s' % url)
1546 video_id = mobj.group(2)
1547 return video_id
1548
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Build the chapter list from the ytInitialData player bar.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when the page carries no chaptered player bar.
        """
        if not webpage:
            return
        data = self._extract_yt_initial_data(video_id, webpage)
        if not data or not isinstance(data, dict):
            return
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Start time is given in milliseconds; convert to seconds
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # Each chapter ends where the next one starts; the last one ends
            # at the video duration
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1593
1594 @staticmethod
1595 def _extract_chapters_from_description(description, duration):
1596 if not description:
1597 return None
1598 chapter_lines = re.findall(
1599 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1600 description)
1601 if not chapter_lines:
1602 return None
1603 chapters = []
1604 for next_num, (chapter_line, time_point) in enumerate(
1605 chapter_lines, start=1):
1606 start_time = parse_duration(time_point)
1607 if start_time is None:
1608 continue
1609 if start_time > duration:
1610 break
1611 end_time = (duration if next_num == len(chapter_lines)
1612 else parse_duration(chapter_lines[next_num][1]))
1613 if end_time is None:
1614 continue
1615 if end_time > duration:
1616 end_time = duration
1617 if start_time > end_time:
1618 break
1619 chapter_title = re.sub(
1620 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1621 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1622 chapters.append({
1623 'start_time': start_time,
1624 'end_time': end_time,
1625 'title': chapter_title,
1626 })
1627 return chapters
1628
1629 def _extract_chapters(self, webpage, description, video_id, duration):
1630 return (self._extract_chapters_from_json(webpage, video_id, duration)
1631 or self._extract_chapters_from_description(description, duration))
1632
1633 def _real_extract(self, url):
1634 url, smuggled_data = unsmuggle_url(url, {})
1635
1636 proto = (
1637 'http' if self._downloader.params.get('prefer_insecure', False)
1638 else 'https')
1639
1640 start_time = None
1641 end_time = None
1642 parsed_url = compat_urllib_parse_urlparse(url)
1643 for component in [parsed_url.fragment, parsed_url.query]:
1644 query = compat_parse_qs(component)
1645 if start_time is None and 't' in query:
1646 start_time = parse_duration(query['t'][0])
1647 if start_time is None and 'start' in query:
1648 start_time = parse_duration(query['start'][0])
1649 if end_time is None and 'end' in query:
1650 end_time = parse_duration(query['end'][0])
1651
1652 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1653 mobj = re.search(self._NEXT_URL_RE, url)
1654 if mobj:
1655 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1656 video_id = self.extract_id(url)
1657
1658 # Get video webpage
1659 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1660 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1661
1662 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1663 video_id = qs.get('v', [None])[0] or video_id
1664
1665 # Attempt to extract SWF player URL
1666 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1667 if mobj is not None:
1668 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1669 else:
1670 player_url = None
1671
1672 dash_mpds = []
1673
1674 def add_dash_mpd(video_info):
1675 dash_mpd = video_info.get('dashmpd')
1676 if dash_mpd and dash_mpd[0] not in dash_mpds:
1677 dash_mpds.append(dash_mpd[0])
1678
1679 def add_dash_mpd_pr(pl_response):
1680 dash_mpd = url_or_none(try_get(
1681 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1682 compat_str))
1683 if dash_mpd and dash_mpd not in dash_mpds:
1684 dash_mpds.append(dash_mpd)
1685
1686 is_live = None
1687 view_count = None
1688
1689 def extract_view_count(v_info):
1690 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1691
1692 def extract_player_response(player_response, video_id):
1693 pl_response = str_or_none(player_response)
1694 if not pl_response:
1695 return
1696 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1697 if isinstance(pl_response, dict):
1698 add_dash_mpd_pr(pl_response)
1699 return pl_response
1700
1701 def extract_embedded_config(embed_webpage, video_id):
1702 embedded_config = self._search_regex(
1703 r'setConfig\(({.*})\);',
1704 embed_webpage, 'ytInitialData', default=None)
1705 if embedded_config:
1706 return embedded_config
1707
1708 player_response = {}
1709
1710 # Get video info
1711 video_info = {}
1712 embed_webpage = None
1713 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1714 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1715 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1716 age_gate = True
1717 # We simulate the access to the video from www.youtube.com/v/{video_id}
1718 # this can be viewed without login into Youtube
1719 url = proto + '://www.youtube.com/embed/%s' % video_id
1720 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1721 ext = extract_embedded_config(embed_webpage, video_id)
1722 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1723 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1724 if not playable_in_embed:
1725 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1726 playable_in_embed = ''
1727 else:
1728 playable_in_embed = playable_in_embed.group('playableinEmbed')
1729 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1730 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1731 if playable_in_embed == 'false':
1732 '''
1733 # TODO apply this patch when Support for Python 2.6(!) and above drops
1734 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1735 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1736 '''
1737 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1738 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1739 age_gate = False
1740 # Try looking directly into the video webpage
1741 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1742 if ytplayer_config:
1743 args = ytplayer_config.get("args")
1744 if args is not None:
1745 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1746 # Convert to the same format returned by compat_parse_qs
1747 video_info = dict((k, [v]) for k, v in args.items())
1748 add_dash_mpd(video_info)
1749 # Rental video is not rented but preview is available (e.g.
1750 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1751 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1752 if not video_info and args.get('ypc_vid'):
1753 return self.url_result(
1754 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1755 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1756 is_live = True
1757 if not player_response:
1758 player_response = extract_player_response(args.get('player_response'), video_id)
1759 elif not player_response:
1760 player_response = ytplayer_config
1761 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1762 add_dash_mpd_pr(player_response)
1763 else:
1764 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1765 else:
1766 data = compat_urllib_parse_urlencode({
1767 'video_id': video_id,
1768 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1769 'sts': self._search_regex(
1770 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1771 })
1772 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1773 try:
1774 video_info_webpage = self._download_webpage(
1775 video_info_url, video_id,
1776 note='Refetching age-gated info webpage',
1777 errnote='unable to download video info webpage')
1778 except ExtractorError:
1779 video_info_webpage = None
1780 if video_info_webpage:
1781 video_info = compat_parse_qs(video_info_webpage)
1782 pl_response = video_info.get('player_response', [None])[0]
1783 player_response = extract_player_response(pl_response, video_id)
1784 add_dash_mpd(video_info)
1785 view_count = extract_view_count(video_info)
1786 else:
1787 age_gate = False
1788 # Try looking directly into the video webpage
1789 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1790 if ytplayer_config:
1791 args = ytplayer_config.get('args', {})
1792 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1793 # Convert to the same format returned by compat_parse_qs
1794 video_info = dict((k, [v]) for k, v in args.items())
1795 add_dash_mpd(video_info)
1796 # Rental video is not rented but preview is available (e.g.
1797 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1798 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1799 if not video_info and args.get('ypc_vid'):
1800 return self.url_result(
1801 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1802 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1803 is_live = True
1804 if not player_response:
1805 player_response = extract_player_response(args.get('player_response'), video_id)
1806 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1807 add_dash_mpd_pr(player_response)
1808
1809 if not video_info and not player_response:
1810 player_response = extract_player_response(
1811 self._search_regex(
1812 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1813 'initial player response', default='{}'),
1814 video_id)
1815
1816 def extract_unavailable_message():
1817 messages = []
1818 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1819 msg = self._html_search_regex(
1820 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1821 video_webpage, 'unavailable %s' % kind, default=None)
1822 if msg:
1823 messages.append(msg)
1824 if messages:
1825 return '\n'.join(messages)
1826
1827 if not video_info and not player_response:
1828 unavailable_message = extract_unavailable_message()
1829 if not unavailable_message:
1830 unavailable_message = 'Unable to extract video data'
1831 raise ExtractorError(
1832 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1833
1834 if not isinstance(video_info, dict):
1835 video_info = {}
1836
1837 video_details = try_get(
1838 player_response, lambda x: x['videoDetails'], dict) or {}
1839
1840 microformat = try_get(
1841 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1842
1843 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1844 if not video_title:
1845 self._downloader.report_warning('Unable to extract video title')
1846 video_title = '_'
1847
1848 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1849 if video_description:
1850
1851 def replace_url(m):
1852 redir_url = compat_urlparse.urljoin(url, m.group(1))
1853 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1854 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1855 qs = compat_parse_qs(parsed_redir_url.query)
1856 q = qs.get('q')
1857 if q and q[0]:
1858 return q[0]
1859 return redir_url
1860
1861 description_original = video_description = re.sub(r'''(?x)
1862 <a\s+
1863 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1864 (?:title|href)="([^"]+)"\s+
1865 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1866 class="[^"]*"[^>]*>
1867 [^<]+\.{3}\s*
1868 </a>
1869 ''', replace_url, video_description)
1870 video_description = clean_html(video_description)
1871 else:
1872 video_description = video_details.get('shortDescription')
1873 if video_description is None:
1874 video_description = self._html_search_meta('description', video_webpage)
1875
1876 if not smuggled_data.get('force_singlefeed', False):
1877 if not self._downloader.params.get('noplaylist'):
1878 multifeed_metadata_list = try_get(
1879 player_response,
1880 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1881 compat_str) or try_get(
1882 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1883 if multifeed_metadata_list:
1884 entries = []
1885 feed_ids = []
1886 for feed in multifeed_metadata_list.split(','):
1887 # Unquote should take place before split on comma (,) since textual
1888 # fields may contain comma as well (see
1889 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1890 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1891
1892 def feed_entry(name):
1893 return try_get(feed_data, lambda x: x[name][0], compat_str)
1894
1895 feed_id = feed_entry('id')
1896 if not feed_id:
1897 continue
1898 feed_title = feed_entry('title')
1899 title = video_title
1900 if feed_title:
1901 title += ' (%s)' % feed_title
1902 entries.append({
1903 '_type': 'url_transparent',
1904 'ie_key': 'Youtube',
1905 'url': smuggle_url(
1906 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1907 {'force_singlefeed': True}),
1908 'title': title,
1909 })
1910 feed_ids.append(feed_id)
1911 self.to_screen(
1912 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1913 % (', '.join(feed_ids), video_id))
1914 return self.playlist_result(entries, video_id, video_title, video_description)
1915 else:
1916 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1917
1918 if view_count is None:
1919 view_count = extract_view_count(video_info)
1920 if view_count is None and video_details:
1921 view_count = int_or_none(video_details.get('viewCount'))
1922 if view_count is None and microformat:
1923 view_count = int_or_none(microformat.get('viewCount'))
1924
1925 if is_live is None:
1926 is_live = bool_or_none(video_details.get('isLive'))
1927
1928 has_live_chat_replay = False
1929 if not is_live:
1930 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1931 try:
1932 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1933 has_live_chat_replay = True
1934 except (KeyError, IndexError, TypeError):
1935 pass
1936
1937 # Check for "rental" videos
1938 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1939 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1940
1941 def _extract_filesize(media_url):
1942 return int_or_none(self._search_regex(
1943 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1944
1945 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1946 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1947
1948 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1949 self.report_rtmp_download()
1950 formats = [{
1951 'format_id': '_rtmp',
1952 'protocol': 'rtmp',
1953 'url': video_info['conn'][0],
1954 'player_url': player_url,
1955 }]
1956 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1957 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1958 if 'rtmpe%3Dyes' in encoded_url_map:
1959 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1960 formats = []
1961 formats_spec = {}
1962 fmt_list = video_info.get('fmt_list', [''])[0]
1963 if fmt_list:
1964 for fmt in fmt_list.split(','):
1965 spec = fmt.split('/')
1966 if len(spec) > 1:
1967 width_height = spec[1].split('x')
1968 if len(width_height) == 2:
1969 formats_spec[spec[0]] = {
1970 'resolution': spec[1],
1971 'width': int_or_none(width_height[0]),
1972 'height': int_or_none(width_height[1]),
1973 }
1974 for fmt in streaming_formats:
1975 itag = str_or_none(fmt.get('itag'))
1976 if not itag:
1977 continue
1978 quality = fmt.get('quality')
1979 quality_label = fmt.get('qualityLabel') or quality
1980 formats_spec[itag] = {
1981 'asr': int_or_none(fmt.get('audioSampleRate')),
1982 'filesize': int_or_none(fmt.get('contentLength')),
1983 'format_note': quality_label,
1984 'fps': int_or_none(fmt.get('fps')),
1985 'height': int_or_none(fmt.get('height')),
1986 # bitrate for itag 43 is always 2147483647
1987 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1988 'width': int_or_none(fmt.get('width')),
1989 }
1990
1991 for fmt in streaming_formats:
1992 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1993 continue
1994 url = url_or_none(fmt.get('url'))
1995
1996 if not url:
1997 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1998 if not cipher:
1999 continue
2000 url_data = compat_parse_qs(cipher)
2001 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2002 if not url:
2003 continue
2004 else:
2005 cipher = None
2006 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2007
2008 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2009 # Unsupported FORMAT_STREAM_TYPE_OTF
2010 if stream_type == 3:
2011 continue
2012
2013 format_id = fmt.get('itag') or url_data['itag'][0]
2014 if not format_id:
2015 continue
2016 format_id = compat_str(format_id)
2017
2018 if cipher:
2019 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2020 ASSETS_RE = (
2021 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2022 r'"jsUrl"\s*:\s*("[^"]+")',
2023 r'"assets":.+?"js":\s*("[^"]+")')
2024 jsplayer_url_json = self._search_regex(
2025 ASSETS_RE,
2026 embed_webpage if age_gate else video_webpage,
2027 'JS player URL (1)', default=None)
2028 if not jsplayer_url_json and not age_gate:
2029 # We need the embed website after all
2030 if embed_webpage is None:
2031 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2032 embed_webpage = self._download_webpage(
2033 embed_url, video_id, 'Downloading embed webpage')
2034 jsplayer_url_json = self._search_regex(
2035 ASSETS_RE, embed_webpage, 'JS player URL')
2036
2037 player_url = json.loads(jsplayer_url_json)
2038 if player_url is None:
2039 player_url_json = self._search_regex(
2040 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2041 video_webpage, 'age gate player URL')
2042 player_url = json.loads(player_url_json)
2043
2044 if 'sig' in url_data:
2045 url += '&signature=' + url_data['sig'][0]
2046 elif 's' in url_data:
2047 encrypted_sig = url_data['s'][0]
2048
2049 if self._downloader.params.get('verbose'):
2050 if player_url is None:
2051 player_desc = 'unknown'
2052 else:
2053 player_type, player_version = self._extract_player_info(player_url)
2054 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2055 parts_sizes = self._signature_cache_id(encrypted_sig)
2056 self.to_screen('{%s} signature length %s, %s' %
2057 (format_id, parts_sizes, player_desc))
2058
2059 signature = self._decrypt_signature(
2060 encrypted_sig, video_id, player_url, age_gate)
2061 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2062 url += '&%s=%s' % (sp, signature)
2063 if 'ratebypass' not in url:
2064 url += '&ratebypass=yes'
2065
2066 dct = {
2067 'format_id': format_id,
2068 'url': url,
2069 'player_url': player_url,
2070 }
2071 if format_id in self._formats:
2072 dct.update(self._formats[format_id])
2073 if format_id in formats_spec:
2074 dct.update(formats_spec[format_id])
2075
2076 # Some itags are not included in DASH manifest thus corresponding formats will
2077 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2078 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2079 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2080 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2081
2082 if width is None:
2083 width = int_or_none(fmt.get('width'))
2084 if height is None:
2085 height = int_or_none(fmt.get('height'))
2086
2087 filesize = int_or_none(url_data.get(
2088 'clen', [None])[0]) or _extract_filesize(url)
2089
2090 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2091 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2092
2093 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2094 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2095 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2096
2097 more_fields = {
2098 'filesize': filesize,
2099 'tbr': tbr,
2100 'width': width,
2101 'height': height,
2102 'fps': fps,
2103 'format_note': quality_label or quality,
2104 }
2105 for key, value in more_fields.items():
2106 if value:
2107 dct[key] = value
2108 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2109 if type_:
2110 type_split = type_.split(';')
2111 kind_ext = type_split[0].split('/')
2112 if len(kind_ext) == 2:
2113 kind, _ = kind_ext
2114 dct['ext'] = mimetype2ext(type_split[0])
2115 if kind in ('audio', 'video'):
2116 codecs = None
2117 for mobj in re.finditer(
2118 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2119 if mobj.group('key') == 'codecs':
2120 codecs = mobj.group('val')
2121 break
2122 if codecs:
2123 dct.update(parse_codecs(codecs))
2124 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2125 dct['downloader_options'] = {
2126 # Youtube throttles chunks >~10M
2127 'http_chunk_size': 10485760,
2128 }
2129 formats.append(dct)
2130 else:
2131 manifest_url = (
2132 url_or_none(try_get(
2133 player_response,
2134 lambda x: x['streamingData']['hlsManifestUrl'],
2135 compat_str))
2136 or url_or_none(try_get(
2137 video_info, lambda x: x['hlsvp'][0], compat_str)))
2138 if manifest_url:
2139 formats = []
2140 m3u8_formats = self._extract_m3u8_formats(
2141 manifest_url, video_id, 'mp4', fatal=False)
2142 for a_format in m3u8_formats:
2143 itag = self._search_regex(
2144 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2145 if itag:
2146 a_format['format_id'] = itag
2147 if itag in self._formats:
2148 dct = self._formats[itag].copy()
2149 dct.update(a_format)
2150 a_format = dct
2151 a_format['player_url'] = player_url
2152 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2153 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2154 if self._downloader.params.get('youtube_include_hls_manifest', True):
2155 formats.append(a_format)
2156 else:
2157 error_message = extract_unavailable_message()
2158 if not error_message:
2159 reason_list = try_get(
2160 player_response,
2161 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2162 list) or []
2163 for reason in reason_list:
2164 if not isinstance(reason, dict):
2165 continue
2166 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2167 if reason_text:
2168 if not error_message:
2169 error_message = ''
2170 error_message += reason_text
2171 if error_message:
2172 error_message = clean_html(error_message)
2173 if not error_message:
2174 error_message = clean_html(try_get(
2175 player_response, lambda x: x['playabilityStatus']['reason'],
2176 compat_str))
2177 if not error_message:
2178 error_message = clean_html(
2179 try_get(video_info, lambda x: x['reason'][0], compat_str))
2180 if error_message:
2181 raise ExtractorError(error_message, expected=True)
2182 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2183
2184 # uploader
2185 video_uploader = try_get(
2186 video_info, lambda x: x['author'][0],
2187 compat_str) or str_or_none(video_details.get('author'))
2188 if video_uploader:
2189 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2190 else:
2191 self._downloader.report_warning('unable to extract uploader name')
2192
2193 # uploader_id
2194 video_uploader_id = None
2195 video_uploader_url = None
2196 mobj = re.search(
2197 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2198 video_webpage)
2199 if mobj is not None:
2200 video_uploader_id = mobj.group('uploader_id')
2201 video_uploader_url = mobj.group('uploader_url')
2202 else:
2203 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2204 if owner_profile_url:
2205 video_uploader_id = self._search_regex(
2206 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2207 default=None)
2208 video_uploader_url = owner_profile_url
2209
2210 channel_id = (
2211 str_or_none(video_details.get('channelId'))
2212 or self._html_search_meta(
2213 'channelId', video_webpage, 'channel id', default=None)
2214 or self._search_regex(
2215 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2216 video_webpage, 'channel id', default=None, group='id'))
2217 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2218
2219 thumbnails = []
2220 thumbnails_list = try_get(
2221 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2222 for t in thumbnails_list:
2223 if not isinstance(t, dict):
2224 continue
2225 thumbnail_url = url_or_none(t.get('url'))
2226 if not thumbnail_url:
2227 continue
2228 thumbnails.append({
2229 'url': thumbnail_url,
2230 'width': int_or_none(t.get('width')),
2231 'height': int_or_none(t.get('height')),
2232 })
2233
2234 if not thumbnails:
2235 video_thumbnail = None
2236 # We try first to get a high quality image:
2237 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2238 video_webpage, re.DOTALL)
2239 if m_thumb is not None:
2240 video_thumbnail = m_thumb.group(1)
2241 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2242 if thumbnail_url:
2243 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2244 if video_thumbnail:
2245 thumbnails.append({'url': video_thumbnail})
2246
2247 # upload date
2248 upload_date = self._html_search_meta(
2249 'datePublished', video_webpage, 'upload date', default=None)
2250 if not upload_date:
2251 upload_date = self._search_regex(
2252 [r'(?s)id="eow-date.*?>(.*?)</span>',
2253 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2254 video_webpage, 'upload date', default=None)
2255 if not upload_date:
2256 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2257 upload_date = unified_strdate(upload_date)
2258
2259 video_license = self._html_search_regex(
2260 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2261 video_webpage, 'license', default=None)
2262
2263 m_music = re.search(
2264 r'''(?x)
2265 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2266 <ul[^>]*>\s*
2267 <li>(?P<title>.+?)
2268 by (?P<creator>.+?)
2269 (?:
2270 \(.+?\)|
2271 <a[^>]*
2272 (?:
2273 \bhref=["\']/red[^>]*>| # drop possible
2274 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2275 )
2276 .*?
2277 )?</li
2278 ''',
2279 video_webpage)
2280 if m_music:
2281 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2282 video_creator = clean_html(m_music.group('creator'))
2283 else:
2284 video_alt_title = video_creator = None
2285
2286 def extract_meta(field):
2287 return self._html_search_regex(
2288 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2289 video_webpage, field, default=None)
2290
2291 track = extract_meta('Song')
2292 artist = extract_meta('Artist')
2293 album = extract_meta('Album')
2294
        # Youtube Music Auto-generated description
        # Parse track/artist/album/release info out of the auto-generated
        # "Provided to YouTube by ..." description; values found earlier
        # (via extract_meta) take precedence over these.
        release_date = release_year = None
        if video_description:
            mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
            if mobj:
                if not track:
                    track = mobj.group('track').strip()
                if not artist:
                    artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
                if not album:
                    # NOTE(review): '.strip()' here is applied to the literal
                    # string 'album' (a no-op), not to the matched text;
                    # mobj.group('album').strip() was presumably intended,
                    # mirroring the 'track' handling above — confirm upstream.
                    album = mobj.group('album'.strip())
                release_year = mobj.group('release_year')
                release_date = mobj.group('release_date')
                if release_date:
                    # Normalize YYYY-MM-DD to YYYYMMDD; derive the year from
                    # it when the ℗ year was not matched.
                    release_date = release_date.replace('-', '')
                    if not release_year:
                        release_year = int(release_date[:4])
                if release_year:
                    release_year = int(release_year)
2314
2315 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2316 if yt_initial:
2317 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2318 if len(music_metadata):
2319 album = music_metadata[0].get('album')
2320 artist = music_metadata[0].get('artist')
2321 track = music_metadata[0].get('track')
2322
2323 m_episode = re.search(
2324 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2325 video_webpage)
2326 if m_episode:
2327 series = unescapeHTML(m_episode.group('series'))
2328 season_number = int(m_episode.group('season'))
2329 episode_number = int(m_episode.group('episode'))
2330 else:
2331 series = season_number = episode_number = None
2332
2333 m_cat_container = self._search_regex(
2334 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2335 video_webpage, 'categories', default=None)
2336 category = None
2337 if m_cat_container:
2338 category = self._html_search_regex(
2339 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2340 default=None)
2341 if not category:
2342 category = try_get(
2343 microformat, lambda x: x['category'], compat_str)
2344 video_categories = None if category is None else [category]
2345
2346 video_tags = [
2347 unescapeHTML(m.group('content'))
2348 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2349 if not video_tags:
2350 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2351
2352 def _extract_count(count_name):
2353 return str_to_int(self._search_regex(
2354 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2355 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2356 video_webpage, count_name, default=None))
2357
2358 like_count = _extract_count('like')
2359 dislike_count = _extract_count('dislike')
2360
2361 if view_count is None:
2362 view_count = str_to_int(self._search_regex(
2363 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2364 'view count', default=None))
2365
2366 average_rating = (
2367 float_or_none(video_details.get('averageRating'))
2368 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2369
2370 # subtitles
2371 video_subtitles = self.extract_subtitles(
2372 video_id, video_webpage, has_live_chat_replay)
2373 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2374
2375 video_duration = try_get(
2376 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2377 if not video_duration:
2378 video_duration = int_or_none(video_details.get('lengthSeconds'))
2379 if not video_duration:
2380 video_duration = parse_duration(self._html_search_meta(
2381 'duration', video_webpage, 'video duration'))
2382
2383 # Get Subscriber Count of channel
2384 subscriber_count = parse_count(self._search_regex(
2385 r'"text":"([\d\.]+\w?) subscribers"',
2386 video_webpage,
2387 'subscriber count',
2388 default=None
2389 ))
2390
2391 # annotations
2392 video_annotations = None
2393 if self._downloader.params.get('writeannotations', False):
2394 xsrf_token = self._search_regex(
2395 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2396 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2397 invideo_url = try_get(
2398 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2399 if xsrf_token and invideo_url:
2400 xsrf_field_name = self._search_regex(
2401 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2402 video_webpage, 'xsrf field name',
2403 group='xsrf_field_name', default='session_token')
2404 video_annotations = self._download_webpage(
2405 self._proto_relative_url(invideo_url),
2406 video_id, note='Downloading annotations',
2407 errnote='Unable to download video annotations', fatal=False,
2408 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2409
2410 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2411
2412 # Look for the DASH manifest
2413 if self._downloader.params.get('youtube_include_dash_manifest', True):
2414 dash_mpd_fatal = True
2415 for mpd_url in dash_mpds:
2416 dash_formats = {}
2417 try:
2418 def decrypt_sig(mobj):
2419 s = mobj.group(1)
2420 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2421 return '/signature/%s' % dec_s
2422
2423 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2424
2425 for df in self._extract_mpd_formats(
2426 mpd_url, video_id, fatal=dash_mpd_fatal,
2427 formats_dict=self._formats):
2428 if not df.get('filesize'):
2429 df['filesize'] = _extract_filesize(df['url'])
2430 # Do not overwrite DASH format found in some previous DASH manifest
2431 if df['format_id'] not in dash_formats:
2432 dash_formats[df['format_id']] = df
2433 # Additional DASH manifests may end up in HTTP Error 403 therefore
2434 # allow them to fail without bug report message if we already have
2435 # some DASH manifest succeeded. This is temporary workaround to reduce
2436 # burst of bug reports until we figure out the reason and whether it
2437 # can be fixed at all.
2438 dash_mpd_fatal = False
2439 except (ExtractorError, KeyError) as e:
2440 self.report_warning(
2441 'Skipping DASH manifest: %r' % e, video_id)
2442 if dash_formats:
2443 # Remove the formats we found through non-DASH, they
2444 # contain less info and it can be wrong, because we use
2445 # fixed values (for example the resolution). See
2446 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2447 # example.
2448 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2449 formats.extend(dash_formats.values())
2450
2451 # Check for malformed aspect ratio
2452 stretched_m = re.search(
2453 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2454 video_webpage)
2455 if stretched_m:
2456 w = float(stretched_m.group('w'))
2457 h = float(stretched_m.group('h'))
2458 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2459 # We will only process correct ratios.
2460 if w > 0 and h > 0:
2461 ratio = w / h
2462 for f in formats:
2463 if f.get('vcodec') != 'none':
2464 f['stretched_ratio'] = ratio
2465
2466 if not formats:
2467 if 'reason' in video_info:
2468 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2469 regions_allowed = self._html_search_meta(
2470 'regionsAllowed', video_webpage, default=None)
2471 countries = regions_allowed.split(',') if regions_allowed else None
2472 self.raise_geo_restricted(
2473 msg=video_info['reason'][0], countries=countries)
2474 reason = video_info['reason'][0]
2475 if 'Invalid parameters' in reason:
2476 unavailable_message = extract_unavailable_message()
2477 if unavailable_message:
2478 reason = unavailable_message
2479 raise ExtractorError(
2480 'YouTube said: %s' % reason,
2481 expected=True, video_id=video_id)
2482 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2483 raise ExtractorError('This video is DRM protected.', expected=True)
2484
2485 self._sort_formats(formats)
2486
2487 self.mark_watched(video_id, video_info, player_response)
2488
2489 return {
2490 'id': video_id,
2491 'uploader': video_uploader,
2492 'uploader_id': video_uploader_id,
2493 'uploader_url': video_uploader_url,
2494 'channel_id': channel_id,
2495 'channel_url': channel_url,
2496 'upload_date': upload_date,
2497 'license': video_license,
2498 'creator': video_creator or artist,
2499 'title': video_title,
2500 'alt_title': video_alt_title or track,
2501 'thumbnails': thumbnails,
2502 'description': video_description,
2503 'categories': video_categories,
2504 'tags': video_tags,
2505 'subtitles': video_subtitles,
2506 'automatic_captions': automatic_captions,
2507 'duration': video_duration,
2508 'age_limit': 18 if age_gate else 0,
2509 'annotations': video_annotations,
2510 'chapters': chapters,
2511 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2512 'view_count': view_count,
2513 'like_count': like_count,
2514 'dislike_count': dislike_count,
2515 'average_rating': average_rating,
2516 'formats': formats,
2517 'is_live': is_live,
2518 'start_time': start_time,
2519 'end_time': end_time,
2520 'series': series,
2521 'season_number': season_number,
2522 'episode_number': episode_number,
2523 'track': track,
2524 'artist': artist,
2525 'album': album,
2526 'release_date': release_date,
2527 'release_year': release_year,
2528 'subscriber_count': subscriber_count,
2529 }
2530
2531
2532class YoutubeTabIE(YoutubeBaseInfoExtractor):
2533 IE_DESC = 'YouTube.com tab'
2534 _VALID_URL = r'''(?x)
2535 https?://
2536 (?:\w+\.)?
2537 (?:
2538 youtube(?:kids)?\.com|
2539 invidio\.us
2540 )/
2541 (?:
2542 (?:channel|c|user)/|
2543 (?P<not_channel>
2544 (?:playlist|watch)\?.*?\blist=
2545 )|
2546 (?!(%s)([/#?]|$)) # Direct URLs
2547 )
2548 (?P<id>[^/?\#&]+)
2549 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
2550 IE_NAME = 'youtube:tab'
2551
2552 _TESTS = [{
2553 # playlists, multipage
2554 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2555 'playlist_mincount': 94,
2556 'info_dict': {
2557 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2558 'title': 'Игорь Клейнер - Playlists',
2559 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2560 },
2561 }, {
2562 # playlists, multipage, different order
2563 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2564 'playlist_mincount': 94,
2565 'info_dict': {
2566 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2567 'title': 'Игорь Клейнер - Playlists',
2568 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2569 },
2570 }, {
2571 # playlists, singlepage
2572 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2573 'playlist_mincount': 4,
2574 'info_dict': {
2575 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2576 'title': 'ThirstForScience - Playlists',
2577 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2578 }
2579 }, {
2580 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2581 'only_matching': True,
2582 }, {
2583 # basic, single video playlist
2584 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2585 'info_dict': {
2586 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2587 'uploader': 'Sergey M.',
2588 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2589 'title': 'youtube-dl public playlist',
2590 },
2591 'playlist_count': 1,
2592 }, {
2593 # empty playlist
2594 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2595 'info_dict': {
2596 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2597 'uploader': 'Sergey M.',
2598 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2599 'title': 'youtube-dl empty playlist',
2600 },
2601 'playlist_count': 0,
2602 }, {
2603 # Home tab
2604 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2605 'info_dict': {
2606 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2607 'title': 'lex will - Home',
2608 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2609 },
2610 'playlist_mincount': 2,
2611 }, {
2612 # Videos tab
2613 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2614 'info_dict': {
2615 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2616 'title': 'lex will - Videos',
2617 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2618 },
2619 'playlist_mincount': 975,
2620 }, {
2621 # Videos tab, sorted by popular
2622 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2623 'info_dict': {
2624 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2625 'title': 'lex will - Videos',
2626 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2627 },
2628 'playlist_mincount': 199,
2629 }, {
2630 # Playlists tab
2631 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2632 'info_dict': {
2633 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2634 'title': 'lex will - Playlists',
2635 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2636 },
2637 'playlist_mincount': 17,
2638 }, {
2639 # Community tab
2640 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2641 'info_dict': {
2642 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2643 'title': 'lex will - Community',
2644 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2645 },
2646 'playlist_mincount': 18,
2647 }, {
2648 # Channels tab
2649 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2650 'info_dict': {
2651 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2652 'title': 'lex will - Channels',
2653 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2654 },
2655 'playlist_mincount': 138,
2656 }, {
2657 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2658 'only_matching': True,
2659 }, {
2660 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2661 'only_matching': True,
2662 }, {
2663 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2664 'only_matching': True,
2665 }, {
2666 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2667 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2668 'info_dict': {
2669 'title': '29C3: Not my department',
2670 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2671 'uploader': 'Christiaan008',
2672 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2673 },
2674 'playlist_count': 96,
2675 }, {
2676 'note': 'Large playlist',
2677 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2678 'info_dict': {
2679 'title': 'Uploads from Cauchemar',
2680 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2681 'uploader': 'Cauchemar',
2682 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2683 },
2684 'playlist_mincount': 1123,
2685 }, {
2686 # even larger playlist, 8832 videos
2687 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2688 'only_matching': True,
2689 }, {
2690 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2691 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2692 'info_dict': {
2693 'title': 'Uploads from Interstellar Movie',
2694 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2695 'uploader': 'Interstellar Movie',
2696 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2697 },
2698 'playlist_mincount': 21,
2699 }, {
2700 # https://github.com/ytdl-org/youtube-dl/issues/21844
2701 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2702 'info_dict': {
2703 'title': 'Data Analysis with Dr Mike Pound',
2704 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2705 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2706 'uploader': 'Computerphile',
2707 },
2708 'playlist_mincount': 11,
2709 }, {
2710 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2711 'only_matching': True,
2712 }, {
2713 # Playlist URL that does not actually serve a playlist
2714 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2715 'info_dict': {
2716 'id': 'FqZTN594JQw',
2717 'ext': 'webm',
2718 'title': "Smiley's People 01 detective, Adventure Series, Action",
2719 'uploader': 'STREEM',
2720 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2721 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2722 'upload_date': '20150526',
2723 'license': 'Standard YouTube License',
2724 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2725 'categories': ['People & Blogs'],
2726 'tags': list,
2727 'view_count': int,
2728 'like_count': int,
2729 'dislike_count': int,
2730 },
2731 'params': {
2732 'skip_download': True,
2733 },
2734 'skip': 'This video is not available.',
2735 'add_ie': [YoutubeIE.ie_key()],
2736 }, {
2737 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2738 'only_matching': True,
2739 }, {
2740 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2741 'only_matching': True,
2742 }, {
2743 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2744 'info_dict': {
2745 'id': '9Auq9mYxFEE',
2746 'ext': 'mp4',
2747 'title': 'Watch Sky News live',
2748 'uploader': 'Sky News',
2749 'uploader_id': 'skynews',
2750 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2751 'upload_date': '20191102',
2752 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2753 'categories': ['News & Politics'],
2754 'tags': list,
2755 'like_count': int,
2756 'dislike_count': int,
2757 },
2758 'params': {
2759 'skip_download': True,
2760 },
2761 }, {
2762 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2763 'info_dict': {
2764 'id': 'a48o2S1cPoo',
2765 'ext': 'mp4',
2766 'title': 'The Young Turks - Live Main Show',
2767 'uploader': 'The Young Turks',
2768 'uploader_id': 'TheYoungTurks',
2769 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2770 'upload_date': '20150715',
2771 'license': 'Standard YouTube License',
2772 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2773 'categories': ['News & Politics'],
2774 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2775 'like_count': int,
2776 'dislike_count': int,
2777 },
2778 'params': {
2779 'skip_download': True,
2780 },
2781 'only_matching': True,
2782 }, {
2783 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2784 'only_matching': True,
2785 }, {
2786 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2787 'only_matching': True,
2788 },
2789 # TODO
2790 # {
2791 # 'url': 'https://www.youtube.com/TheYoungTurks/live',
2792 # 'only_matching': True,
2793 # }
2794 ]
2795
2796 def _extract_channel_id(self, webpage):
2797 channel_id = self._html_search_meta(
2798 'channelId', webpage, 'channel id', default=None)
2799 if channel_id:
2800 return channel_id
2801 channel_url = self._html_search_meta(
2802 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2803 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2804 'twitter:app:url:googleplay'), webpage, 'channel url')
2805 return self._search_regex(
2806 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2807 channel_url, 'channel id')
2808
2809 @staticmethod
2810 def _extract_grid_item_renderer(item):
2811 for item_kind in ('Playlist', 'Video', 'Channel'):
2812 renderer = item.get('grid%sRenderer' % item_kind)
2813 if renderer:
2814 return renderer
2815
2816 def _extract_video(self, renderer):
2817 video_id = renderer.get('videoId')
2818 title = try_get(
2819 renderer,
2820 (lambda x: x['title']['runs'][0]['text'],
2821 lambda x: x['title']['simpleText']), compat_str)
2822 description = try_get(
2823 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2824 compat_str)
2825 duration = parse_duration(try_get(
2826 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2827 view_count_text = try_get(
2828 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2829 view_count = str_to_int(self._search_regex(
2830 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2831 'view count', default=None))
2832 uploader = try_get(
2833 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2834 return {
2835 '_type': 'url_transparent',
2836 'ie_key': YoutubeIE.ie_key(),
2837 'id': video_id,
2838 'url': video_id,
2839 'title': title,
2840 'description': description,
2841 'duration': duration,
2842 'view_count': view_count,
2843 'uploader': uploader,
2844 }
2845
2846 def _grid_entries(self, grid_renderer):
2847 for item in grid_renderer['items']:
2848 if not isinstance(item, dict):
2849 continue
2850 renderer = self._extract_grid_item_renderer(item)
2851 if not isinstance(renderer, dict):
2852 continue
2853 title = try_get(
2854 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2855 # playlist
2856 playlist_id = renderer.get('playlistId')
2857 if playlist_id:
2858 yield self.url_result(
2859 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2860 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2861 video_title=title)
2862 # video
2863 video_id = renderer.get('videoId')
2864 if video_id:
2865 yield self._extract_video(renderer)
2866 # channel
2867 channel_id = renderer.get('channelId')
2868 if channel_id:
2869 title = try_get(
2870 renderer, lambda x: x['title']['simpleText'], compat_str)
2871 yield self.url_result(
2872 'https://www.youtube.com/channel/%s' % channel_id,
2873 ie=YoutubeTabIE.ie_key(), video_title=title)
2874
2875 def _shelf_entries_trimmed(self, shelf_renderer):
2876 renderer = try_get(
2877 shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
2878 if not renderer:
2879 return
2880 # TODO: add support for nested playlists so each shelf is processed
2881 # as separate playlist
2882 # TODO: this includes only first N items
2883 for entry in self._grid_entries(renderer):
2884 yield entry
2885
2886 def _shelf_entries(self, shelf_renderer):
2887 ep = try_get(
2888 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2889 compat_str)
2890 shelf_url = urljoin('https://www.youtube.com', ep)
2891 if not shelf_url:
2892 return
2893 title = try_get(
2894 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2895 yield self.url_result(shelf_url, video_title=title)
2896
2897 def _playlist_entries(self, video_list_renderer):
2898 for content in video_list_renderer['contents']:
2899 if not isinstance(content, dict):
2900 continue
2901 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2902 if not isinstance(renderer, dict):
2903 continue
2904 video_id = renderer.get('videoId')
2905 if not video_id:
2906 continue
2907 yield self._extract_video(renderer)
2908
2909 def _itemSection_entries(self, item_sect_renderer):
2910 for content in item_sect_renderer['contents']:
2911 if not isinstance(content, dict):
2912 continue
2913 renderer = content.get('videoRenderer', {})
2914 if not isinstance(renderer, dict):
2915 continue
2916 video_id = renderer.get('videoId')
2917 if not video_id:
2918 continue
2919 yield self._extract_video(renderer)
2920
2921 def _rich_entries(self, rich_grid_renderer):
2922 renderer = try_get(
2923 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
2924 video_id = renderer.get('videoId')
2925 if not video_id:
2926 return
2927 yield self._extract_video(renderer)
2928
2929 def _video_entry(self, video_renderer):
2930 video_id = video_renderer.get('videoId')
2931 if video_id:
2932 return self._extract_video(video_renderer)
2933
2934 def _post_thread_entries(self, post_thread_renderer):
2935 post_renderer = try_get(
2936 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
2937 if not post_renderer:
2938 return
2939 # video attachment
2940 video_renderer = try_get(
2941 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
2942 video_id = None
2943 if video_renderer:
2944 entry = self._video_entry(video_renderer)
2945 if entry:
2946 yield entry
2947 # inline video links
2948 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
2949 for run in runs:
2950 if not isinstance(run, dict):
2951 continue
2952 ep_url = try_get(
2953 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
2954 if not ep_url:
2955 continue
2956 if not YoutubeIE.suitable(ep_url):
2957 continue
2958 ep_video_id = YoutubeIE._match_id(ep_url)
2959 if video_id == ep_video_id:
2960 continue
2961 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
2962
2963 def _post_thread_continuation_entries(self, post_thread_continuation):
2964 contents = post_thread_continuation.get('contents')
2965 if not isinstance(contents, list):
2966 return
2967 for content in contents:
2968 renderer = content.get('backstagePostThreadRenderer')
2969 if not isinstance(renderer, dict):
2970 continue
2971 for entry in self._post_thread_entries(renderer):
2972 yield entry
2973
2974 @staticmethod
2975 def _extract_next_continuation_data(renderer):
2976 next_continuation = try_get(
2977 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
2978 if not next_continuation:
2979 return
2980 continuation = next_continuation.get('continuation')
2981 if not continuation:
2982 return
2983 ctp = next_continuation.get('clickTrackingParams')
2984 return {
2985 'ctoken': continuation,
2986 'continuation': continuation,
2987 'itct': ctp,
2988 }
2989
2990 @classmethod
2991 def _extract_continuation(cls, renderer):
2992 next_continuation = cls._extract_next_continuation_data(renderer)
2993 if next_continuation:
2994 return next_continuation
2995 contents = renderer.get('contents')
2996 if not isinstance(contents, list):
2997 return
2998 for content in contents:
2999 if not isinstance(content, dict):
3000 continue
3001 continuation_ep = try_get(
3002 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
3003 dict)
3004 if not continuation_ep:
3005 continue
3006 continuation = try_get(
3007 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
3008 if not continuation:
3009 continue
3010 ctp = continuation_ep.get('clickTrackingParams')
3011 if not ctp:
3012 continue
3013 return {
3014 'ctoken': continuation,
3015 'continuation': continuation,
3016 'itct': ctp,
3017 }
3018
    def _entries(self, tab, identity_token):
        """Yield all entries of a tab, transparently following continuation
        ("infinite scroll") pages until exhausted.

        ``tab`` is the selected tab's content dict; ``identity_token`` (if
        any) is sent as x-youtube-identity-token so logged-in feeds work.
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            # Yields entries from one renderer page and stores the next
            # continuation (if any) in continuation_list[0] as a side effect.
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Rich grid layout (e.g. home feed) has no item section.
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        for entry in self._shelf_entries(renderer):
                            yield entry
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # One-element list used as a mutable cell: Python 2 does not support
        # the nonlocal statement, so the generator writes through this list.
        continuation_list = [None]
        parent_renderer = (
            try_get(tab, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        # Follow continuations until no renderer yields a next token.
        for page_num in itertools.count(1):
            if not continuation:
                break
            browse = self._download_json(
                'https://www.youtube.com/browse_ajax', None,
                'Downloading page %d' % page_num,
                headers=headers, query=continuation, fatal=False)
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Older API shape: results arrive under continuationContents,
            # keyed by the kind of list being continued.
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
                if continuation_renderer:
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # Newer API shape: results arrive as appendContinuationItemsAction.
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
            break
3144
3145 @staticmethod
3146 def _extract_selected_tab(tabs):
3147 for tab in tabs:
3148 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3149 return tab['tabRenderer']
3150 else:
3151 raise ExtractorError('Unable to find selected tab')
3152
3153 @staticmethod
3154 def _extract_uploader(data):
3155 uploader = {}
3156 sidebar_renderer = try_get(
3157 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3158 if sidebar_renderer:
3159 for item in sidebar_renderer:
3160 if not isinstance(item, dict):
3161 continue
3162 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3163 if not isinstance(renderer, dict):
3164 continue
3165 owner = try_get(
3166 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3167 if owner:
3168 uploader['uploader'] = owner.get('text')
3169 uploader['uploader_id'] = try_get(
3170 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3171 uploader['uploader_url'] = urljoin(
3172 'https://www.youtube.com/',
3173 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3174 return uploader
3175
    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Build a playlist result for the selected tab of a channel/playlist
        page.

        Title/description/id come from channelMetadataRenderer, but a
        playlistMetadataRenderer (present on playlist pages) deliberately
        overrides them afterwards.
        """
        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        playlist_id = title = description = None
        if renderer:
            channel_title = renderer.get('title') or item_id
            tab_title = selected_tab.get('title')
            title = channel_title or item_id
            # e.g. "lex will - Videos"
            if tab_title:
                title += ' - %s' % tab_title
            description = renderer.get('description')
            playlist_id = renderer.get('externalId')
        # Playlist metadata (if present) takes precedence over channel metadata.
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
        if renderer:
            title = renderer.get('title')
            description = None
            playlist_id = item_id
        # Fallbacks so the result always has an id and a title.
        if playlist_id is None:
            playlist_id = item_id
        if title is None:
            title = "Youtube " + playlist_id.title()
        playlist = self.playlist_result(
            self._entries(selected_tab['content'], identity_token),
            playlist_id=playlist_id, playlist_title=title,
            playlist_description=description)
        playlist.update(self._extract_uploader(data))
        return playlist
3205
3206 def _extract_from_playlist(self, item_id, data, playlist):
3207 title = playlist.get('title') or try_get(
3208 data, lambda x: x['titleText']['simpleText'], compat_str)
3209 playlist_id = playlist.get('playlistId') or item_id
3210 return self.playlist_result(
3211 self._playlist_entries(playlist), playlist_id=playlist_id,
3212 playlist_title=title)
3213
    def _real_extract(self, url):
        """Dispatch a tab/channel/playlist/watch URL.

        Normalizes the host, rewrites bare channel URLs to their /videos
        tab, then extracts from tabs, from an inline watch-page playlist,
        or finally falls back to single-video extraction.
        """
        item_id = self._match_id(url)
        # Force the www.youtube.com host so subdomain/mirror URLs behave
        # identically downstream.
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # A "home" URL is a matched URL with nothing but an optional '/'
        # (and fragment/query) after it, i.e. a bare channel/user page.
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/home" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        if video_id and playlist_id:
            # --no-playlist short-circuits to the single video.
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3256
3257
class YoutubePlaylistIE(InfoExtractor):
    """Thin wrapper for playlist URLs and bare playlist IDs.

    Performs no extraction itself: it normalizes the input into a
    canonical https://www.youtube.com/playlist URL and delegates to
    YoutubeTabIE.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to YoutubeTabIE for anything it claims, so tab/watch URLs
        # that merely contain a list= parameter are not swallowed here.
        return False if YoutubeTabIE.suitable(url) else super(
            YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        # Bare playlist IDs (e.g. 'PL...') carry no query string; synthesize
        # the list= parameter so the canonical URL round-trips the id.
        if not qs:
            qs = {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3356
3357
class YoutubeYtUserIE(InfoExtractor):
    """Resolve the 'ytuser:<name>' shorthand to a channel page handled by YoutubeTabIE."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3370
3371
class YoutubeFavouritesIE(InfoExtractor):
    """Resolve ':ytfav' (and variants) to the liked-videos playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos live in the special 'LL' playlist; hand it to the
        # generic tab extractor.
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3389
3390
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent results for query via the youtubei search API.

        Pages through continuation tokens until n results have been yielded
        or no further continuation is returned.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page nests results under twoColumnSearchResultsRenderer;
            # continuation pages use appendContinuationItemsAction instead.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Match digits together with thousands separators and let
                # str_to_int strip them; matching only r'^(\d+)' would stop
                # at the first comma of e.g. '1,234,567 views' and report 1.
                view_count = str_to_int(self._search_regex(
                    r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3479
3480
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant ordered by upload date (newest first)."""
    _SEARCH_KEY = 'ytsearchdate'
    # Encoded search filter selecting date ordering (URL-escaped 'CAI=').
    _SEARCH_PARAMS = 'CAI%3D'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
3486
3487
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Handle real /results search URLs rather than the 'ytsearch:' scheme."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # Use the URL pattern directly instead of the ytsearch: scheme.
        return cls._VALID_URL

    def _real_extract(self, url):
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query = (params.get('search_query') or params.get('q'))[0]
        # Carry the URL's 'sp' filter into _entries by setting it on the
        # instance before running the search.
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3513
3514
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        # Derived name, e.g. 'youtube:history' for _FEED_NAME == 'history'.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _shelf_entries(self, shelf_renderer):
        # Feed pages wrap their grid in a shelf; unwrap it and delegate to
        # the base class's grid handling (defined on YoutubeTabIE).
        renderer = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
        if not renderer:
            return
        for entry in self._grid_entries(renderer):
            yield entry

    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        # Overrides the YoutubeTabIE hook; signature must match the base even
        # though item_id/webpage/data are unused here.
        selected_tab = self._extract_selected_tab(tabs)
        return self.playlist_result(
            self._entries(selected_tab['content'], identity_token),
            playlist_title=self._PLAYLIST_TITLE)

    def _real_extract(self, url):
        # The incoming url is ignored; the feed page is derived from
        # _FEED_NAME so aliases like ':ythistory' work uniformly.
        item_id = self._FEED_NAME
        url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        webpage = self._download_webpage(url, item_id)
        # Identity token authenticates follow-up continuation requests;
        # absence is tolerated (default=None).
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        # Failed to recognize
        raise ExtractorError('Unable to recognize feed page')
3558
3559
class YoutubeWatchLaterIE(InfoExtractor):
    """Resolve the watch-later feed (':ytwatchlater', 'WL') to the WL playlist."""
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL'

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch-later is the special 'WL' playlist.
        wl_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(wl_url, ie=YoutubeTabIE.ie_key())
3576
3577
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the recommended-videos page (':ytrec')."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
3583
3584
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the subscriptions page (':ytsubs')."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
3590
3591
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the watch-history page (':ythistory')."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
3597
3598
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose query string lost the video ID (usually an
    unquoted '&' in the shell) and fail with a helpful message."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Fix: the original message concatenation produced a double space
        # between the example URL and 'or simply'.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
3646
3647
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video ID is shorter than the full 11 characters."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
3663
3664
# Do Youtube show urls even exist anymore? I couldn't find any
# NOTE: the definition below is deliberately disabled by wrapping it in a
# raw-string expression statement, so it is never executed or registered.
r'''
class YoutubeShowIE(YoutubeTabIE):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
'''