# Source: youtube_dlc/extractor/youtube.py (yt-dlp / youtube-dlc)
# Commit: [youtube_live_chat] Fix URL
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 ExtractorError,
34 format_field,
35 float_or_none,
36 get_element_by_id,
37 int_or_none,
38 mimetype2ext,
39 parse_codecs,
40 parse_count,
41 parse_duration,
42 remove_quotes,
43 remove_start,
44 smuggle_url,
45 str_or_none,
46 str_to_int,
47 try_get,
48 unescapeHTML,
49 unified_strdate,
50 unsmuggle_url,
51 update_url_query,
52 uppercase_escape,
53 url_or_none,
54 urlencode_postdata,
55 urljoin,
56 )
57
58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's (undocumented) sign-in RPC used by _login().
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path components that can never be video/channel identifiers.
    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist IDs by their known prefixes, plus the special
    # mix/watch-later/liked lists (RDMM, WL, LL, LM).
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _set_language(self):
        """Force the English YouTube UI by setting the PREF cookie."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result entries for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs carry the CSRF/session tokens required by the
        # sign-in RPC below.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow; f_req is a positional JSON
            # array whose element order is protocol-significant.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Response is prefixed with an anti-XSSI guard; strip
                # everything before the first '[' so it parses as JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # The opaque account token needed for the password/TFA steps.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry here means the password step itself failed.
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # NOTE(review): due to conditional-expression precedence the
            # 'Unable to login: ' prefix is only shown for
            # INCORRECT_ANSWER_ENTERED; otherwise the raw message is printed.
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token ties the TFA submission to this challenge.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes pasted from SMS may carry a 'G-' prefix.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # NOTE(review): same precedence quirk as the login warning
                    # above - the prefix only applies to the first branch.
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Any other challenge type cannot be solved programmatically;
                # point the user at a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Visiting CheckCookie finalizes the session cookies; a successful
        # login redirects through myaccount.google.com.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Copy the query dict so callers' mappings are never mutated.
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt login before extraction."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Innertube context sent with every /youtubei/v1 request (web client).
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # Regexes locating the JSON blobs embedded in watch/browse pages.
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id):
        """POST to the Innertube endpoint *ep* and return the parsed JSON.

        *query* is merged over the default web-client context; the static
        'key' query parameter is the one used by the YouTube web frontend.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Parse the ytInitialData JSON object embedded in *webpage*.

        Tries the boundary-delimited form first, then the bare assignment.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Parse the ytcfg.set(...) config object; returns {} if absent."""
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a url_transparent result dict from a videoRenderer object.

        All fields are extracted defensively with try_get, so missing keys
        simply yield None values.
        """
        video_id = renderer.get('videoId')
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Strip whitespace/thousand separators before parsing the leading digits.
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
356
357
358 class YoutubeIE(YoutubeBaseInfoExtractor):
359 IE_DESC = 'YouTube.com'
360 _VALID_URL = r"""(?x)^
361 (
362 (?:https?://|//) # http(s):// or protocol-independent URL
363 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
364 (?:www\.)?deturl\.com/www\.youtube\.com/|
365 (?:www\.)?pwnyoutube\.com/|
366 (?:www\.)?hooktube\.com/|
367 (?:www\.)?yourepeat\.com/|
368 tube\.majestyc\.net/|
369 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
370 (?:(?:www|dev)\.)?invidio\.us/|
371 (?:(?:www|no)\.)?invidiou\.sh/|
372 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
373 (?:www\.)?invidious\.kabi\.tk/|
374 (?:www\.)?invidious\.13ad\.de/|
375 (?:www\.)?invidious\.mastodon\.host/|
376 (?:www\.)?invidious\.zapashcanon\.fr/|
377 (?:www\.)?invidious\.kavin\.rocks/|
378 (?:www\.)?invidious\.tube/|
379 (?:www\.)?invidiou\.site/|
380 (?:www\.)?invidious\.site/|
381 (?:www\.)?invidious\.xyz/|
382 (?:www\.)?invidious\.nixnet\.xyz/|
383 (?:www\.)?invidious\.drycat\.fr/|
384 (?:www\.)?tube\.poal\.co/|
385 (?:www\.)?tube\.connect\.cafe/|
386 (?:www\.)?vid\.wxzm\.sx/|
387 (?:www\.)?vid\.mint\.lgbt/|
388 (?:www\.)?yewtu\.be/|
389 (?:www\.)?yt\.elukerio\.org/|
390 (?:www\.)?yt\.lelux\.fi/|
391 (?:www\.)?invidious\.ggc-project\.de/|
392 (?:www\.)?yt\.maisputain\.ovh/|
393 (?:www\.)?invidious\.13ad\.de/|
394 (?:www\.)?invidious\.toot\.koeln/|
395 (?:www\.)?invidious\.fdn\.fr/|
396 (?:www\.)?watch\.nettohikari\.com/|
397 (?:www\.)?kgg2m7yk5aybusll\.onion/|
398 (?:www\.)?qklhadlycap4cnod\.onion/|
399 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
400 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
401 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
402 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
403 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
404 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
405 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
406 (?:.*?\#/)? # handle anchor (#/) redirect urls
407 (?: # the various things that can precede the ID:
408 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
409 |(?: # or the v= param in all its forms
410 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
411 (?:\?|\#!?) # the params delimiter ? or # or #!
412 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
413 v=
414 )
415 ))
416 |(?:
417 youtu\.be| # just youtu.be/xxxx
418 vid\.plus| # or vid.plus/xxxx
419 zwearz\.com/watch| # or zwearz.com/watch/xxxx
420 )/
421 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
422 )
423 )? # all until now is optional -> you can pass the naked ID
424 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
425 (?!.*?\blist=
426 (?:
427 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
428 WL # WL are handled by the watch later IE
429 )
430 )
431 (?(1).+)? # if we found the ID, everything can follow
432 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
433 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
434 _PLAYER_INFO_RE = (
435 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
436 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
437 )
438 _formats = {
439 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
440 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
441 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
442 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
443 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
444 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
445 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
446 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
447 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
448 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
449 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
450 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
451 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
452 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
453 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
454 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
455 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
456 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
457
458
459 # 3D videos
460 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
461 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
462 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
463 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
464 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
465 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
466 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
467
468 # Apple HTTP Live Streaming
469 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
470 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
471 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
472 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
473 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
474 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
475 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
476 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
477
478 # DASH mp4 video
479 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
480 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
481 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
482 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
483 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
484 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
485 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
486 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
487 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
488 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
489 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
490 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
491
492 # Dash mp4 audio
493 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
494 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
495 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
496 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
497 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
498 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
499 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
500
501 # Dash webm
502 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
503 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
504 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
505 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
506 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
507 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
508 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
509 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
510 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
511 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
512 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
513 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
514 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
515 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
516 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
517 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
518 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
519 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
520 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
521 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
522 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
524
525 # Dash webm audio
526 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
527 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
528
529 # Dash webm audio with opus inside
530 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
531 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
532 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
533
534 # RTMP (unnamed)
535 '_rtmp': {'protocol': 'rtmp'},
536
537 # av01 video only formats sometimes served with "unknown" codecs
538 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
539 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
540 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
541 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
542 }
543 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
544
545 _GEO_BYPASS = False
546
547 IE_NAME = 'youtube'
548 _TESTS = [
549 {
550 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
551 'info_dict': {
552 'id': 'BaW_jenozKc',
553 'ext': 'mp4',
554 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
555 'uploader': 'Philipp Hagemeister',
556 'uploader_id': 'phihag',
557 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
558 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
559 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
560 'upload_date': '20121002',
561 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
562 'categories': ['Science & Technology'],
563 'tags': ['youtube-dl'],
564 'duration': 10,
565 'view_count': int,
566 'like_count': int,
567 'dislike_count': int,
568 'start_time': 1,
569 'end_time': 9,
570 }
571 },
572 {
573 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
574 'note': 'Embed-only video (#1746)',
575 'info_dict': {
576 'id': 'yZIXLfi8CZQ',
577 'ext': 'mp4',
578 'upload_date': '20120608',
579 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
580 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
581 'uploader': 'SET India',
582 'uploader_id': 'setindia',
583 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
584 'age_limit': 18,
585 }
586 },
587 {
588 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
589 'note': 'Use the first video ID in the URL',
590 'info_dict': {
591 'id': 'BaW_jenozKc',
592 'ext': 'mp4',
593 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
594 'uploader': 'Philipp Hagemeister',
595 'uploader_id': 'phihag',
596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
597 'upload_date': '20121002',
598 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
599 'categories': ['Science & Technology'],
600 'tags': ['youtube-dl'],
601 'duration': 10,
602 'view_count': int,
603 'like_count': int,
604 'dislike_count': int,
605 },
606 'params': {
607 'skip_download': True,
608 },
609 },
610 {
611 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
612 'note': '256k DASH audio (format 141) via DASH manifest',
613 'info_dict': {
614 'id': 'a9LDPn-MO4I',
615 'ext': 'm4a',
616 'upload_date': '20121002',
617 'uploader_id': '8KVIDEO',
618 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
619 'description': '',
620 'uploader': '8KVIDEO',
621 'title': 'UHDTV TEST 8K VIDEO.mp4'
622 },
623 'params': {
624 'youtube_include_dash_manifest': True,
625 'format': '141',
626 },
627 'skip': 'format 141 not served anymore',
628 },
629 # DASH manifest with encrypted signature
630 {
631 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
632 'info_dict': {
633 'id': 'IB3lcPjvWLA',
634 'ext': 'm4a',
635 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
636 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
637 'duration': 244,
638 'uploader': 'AfrojackVEVO',
639 'uploader_id': 'AfrojackVEVO',
640 'upload_date': '20131011',
641 },
642 'params': {
643 'youtube_include_dash_manifest': True,
644 'format': '141/bestaudio[ext=m4a]',
645 },
646 },
647 # Controversy video
648 {
649 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
650 'info_dict': {
651 'id': 'T4XJQO3qol8',
652 'ext': 'mp4',
653 'duration': 219,
654 'upload_date': '20100909',
655 'uploader': 'Amazing Atheist',
656 'uploader_id': 'TheAmazingAtheist',
657 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
658 'title': 'Burning Everyone\'s Koran',
659 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
660 }
661 },
662 # Normal age-gate video (embed allowed)
663 {
664 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
665 'info_dict': {
666 'id': 'HtVdAasjOgU',
667 'ext': 'mp4',
668 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
669 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
670 'duration': 142,
671 'uploader': 'The Witcher',
672 'uploader_id': 'WitcherGame',
673 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
674 'upload_date': '20140605',
675 'age_limit': 18,
676 },
677 },
678 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
679 # YouTube Red ad is not captured for creator
680 {
681 'url': '__2ABJjxzNo',
682 'info_dict': {
683 'id': '__2ABJjxzNo',
684 'ext': 'mp4',
685 'duration': 266,
686 'upload_date': '20100430',
687 'uploader_id': 'deadmau5',
688 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
689 'creator': 'Dada Life, deadmau5',
690 'description': 'md5:12c56784b8032162bb936a5f76d55360',
691 'uploader': 'deadmau5',
692 'title': 'Deadmau5 - Some Chords (HD)',
693 'alt_title': 'This Machine Kills Some Chords',
694 },
695 'expected_warnings': [
696 'DASH manifest missing',
697 ]
698 },
699 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
700 {
701 'url': 'lqQg6PlCWgI',
702 'info_dict': {
703 'id': 'lqQg6PlCWgI',
704 'ext': 'mp4',
705 'duration': 6085,
706 'upload_date': '20150827',
707 'uploader_id': 'olympic',
708 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
709 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
710 'uploader': 'Olympic',
711 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
712 },
713 'params': {
714 'skip_download': 'requires avconv',
715 }
716 },
717 # Non-square pixels
718 {
719 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
720 'info_dict': {
721 'id': '_b-2C3KPAM0',
722 'ext': 'mp4',
723 'stretched_ratio': 16 / 9.,
724 'duration': 85,
725 'upload_date': '20110310',
726 'uploader_id': 'AllenMeow',
727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
728 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
729 'uploader': '孫ᄋᄅ',
730 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
731 },
732 },
733 # url_encoded_fmt_stream_map is empty string
734 {
735 'url': 'qEJwOuvDf7I',
736 'info_dict': {
737 'id': 'qEJwOuvDf7I',
738 'ext': 'webm',
739 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
740 'description': '',
741 'upload_date': '20150404',
742 'uploader_id': 'spbelect',
743 'uploader': 'Наблюдатели Петербурга',
744 },
745 'params': {
746 'skip_download': 'requires avconv',
747 },
748 'skip': 'This live event has ended.',
749 },
750 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
751 {
752 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
753 'info_dict': {
754 'id': 'FIl7x6_3R5Y',
755 'ext': 'webm',
756 'title': 'md5:7b81415841e02ecd4313668cde88737a',
757 'description': 'md5:116377fd2963b81ec4ce64b542173306',
758 'duration': 220,
759 'upload_date': '20150625',
760 'uploader_id': 'dorappi2000',
761 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
762 'uploader': 'dorappi2000',
763 'formats': 'mincount:31',
764 },
765 'skip': 'not actual anymore',
766 },
767 # DASH manifest with segment_list
768 {
769 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
770 'md5': '8ce563a1d667b599d21064e982ab9e31',
771 'info_dict': {
772 'id': 'CsmdDsKjzN8',
773 'ext': 'mp4',
774 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
775 'uploader': 'Airtek',
776 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
777 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
778 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
779 },
780 'params': {
781 'youtube_include_dash_manifest': True,
782 'format': '135', # bestvideo
783 },
784 'skip': 'This live event has ended.',
785 },
786 {
787 # Multifeed videos (multiple cameras), URL is for Main Camera
788 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
789 'info_dict': {
790 'id': 'jqWvoWXjCVs',
791 'title': 'teamPGP: Rocket League Noob Stream',
792 'description': 'md5:dc7872fb300e143831327f1bae3af010',
793 },
794 'playlist': [{
795 'info_dict': {
796 'id': 'jqWvoWXjCVs',
797 'ext': 'mp4',
798 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
799 'description': 'md5:dc7872fb300e143831327f1bae3af010',
800 'duration': 7335,
801 'upload_date': '20150721',
802 'uploader': 'Beer Games Beer',
803 'uploader_id': 'beergamesbeer',
804 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
805 'license': 'Standard YouTube License',
806 },
807 }, {
808 'info_dict': {
809 'id': '6h8e8xoXJzg',
810 'ext': 'mp4',
811 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
812 'description': 'md5:dc7872fb300e143831327f1bae3af010',
813 'duration': 7337,
814 'upload_date': '20150721',
815 'uploader': 'Beer Games Beer',
816 'uploader_id': 'beergamesbeer',
817 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
818 'license': 'Standard YouTube License',
819 },
820 }, {
821 'info_dict': {
822 'id': 'PUOgX5z9xZw',
823 'ext': 'mp4',
824 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
825 'description': 'md5:dc7872fb300e143831327f1bae3af010',
826 'duration': 7337,
827 'upload_date': '20150721',
828 'uploader': 'Beer Games Beer',
829 'uploader_id': 'beergamesbeer',
830 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
831 'license': 'Standard YouTube License',
832 },
833 }, {
834 'info_dict': {
835 'id': 'teuwxikvS5k',
836 'ext': 'mp4',
837 'title': 'teamPGP: Rocket League Noob Stream (zim)',
838 'description': 'md5:dc7872fb300e143831327f1bae3af010',
839 'duration': 7334,
840 'upload_date': '20150721',
841 'uploader': 'Beer Games Beer',
842 'uploader_id': 'beergamesbeer',
843 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
844 'license': 'Standard YouTube License',
845 },
846 }],
847 'params': {
848 'skip_download': True,
849 },
850 'skip': 'This video is not available.',
851 },
852 {
853 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
854 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
855 'info_dict': {
856 'id': 'gVfLd0zydlo',
857 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
858 },
859 'playlist_count': 2,
860 'skip': 'Not multifeed anymore',
861 },
862 {
863 'url': 'https://vid.plus/FlRa-iH7PGw',
864 'only_matching': True,
865 },
866 {
867 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
868 'only_matching': True,
869 },
870 {
871 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
872 # Also tests cut-off URL expansion in video description (see
873 # https://github.com/ytdl-org/youtube-dl/issues/1892,
874 # https://github.com/ytdl-org/youtube-dl/issues/8164)
875 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
876 'info_dict': {
877 'id': 'lsguqyKfVQg',
878 'ext': 'mp4',
879 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
880 'alt_title': 'Dark Walk - Position Music',
881 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
882 'duration': 133,
883 'upload_date': '20151119',
884 'uploader_id': 'IronSoulElf',
885 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
886 'uploader': 'IronSoulElf',
887 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
888 'track': 'Dark Walk - Position Music',
889 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
890 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
891 },
892 'params': {
893 'skip_download': True,
894 },
895 },
896 {
897 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
898 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
899 'only_matching': True,
900 },
901 {
902 # Video with yt:stretch=17:0
903 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
904 'info_dict': {
905 'id': 'Q39EVAstoRM',
906 'ext': 'mp4',
907 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
908 'description': 'md5:ee18a25c350637c8faff806845bddee9',
909 'upload_date': '20151107',
910 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
911 'uploader': 'CH GAMER DROID',
912 },
913 'params': {
914 'skip_download': True,
915 },
916 'skip': 'This video does not exist.',
917 },
918 {
919 # Video licensed under Creative Commons
920 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
921 'info_dict': {
922 'id': 'M4gD1WSo5mA',
923 'ext': 'mp4',
924 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
925 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
926 'duration': 721,
927 'upload_date': '20150127',
928 'uploader_id': 'BerkmanCenter',
929 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
930 'uploader': 'The Berkman Klein Center for Internet & Society',
931 'license': 'Creative Commons Attribution license (reuse allowed)',
932 },
933 'params': {
934 'skip_download': True,
935 },
936 },
937 {
938 # Channel-like uploader_url
939 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
940 'info_dict': {
941 'id': 'eQcmzGIKrzg',
942 'ext': 'mp4',
943 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
944 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
945 'duration': 4060,
946 'upload_date': '20151119',
947 'uploader': 'Bernie Sanders',
948 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
950 'license': 'Creative Commons Attribution license (reuse allowed)',
951 },
952 'params': {
953 'skip_download': True,
954 },
955 },
956 {
957 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
958 'only_matching': True,
959 },
960 {
961 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
962 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
963 'only_matching': True,
964 },
965 {
966 # Rental video preview
967 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
968 'info_dict': {
969 'id': 'uGpuVWrhIzE',
970 'ext': 'mp4',
971 'title': 'Piku - Trailer',
972 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
973 'upload_date': '20150811',
974 'uploader': 'FlixMatrix',
975 'uploader_id': 'FlixMatrixKaravan',
976 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
977 'license': 'Standard YouTube License',
978 },
979 'params': {
980 'skip_download': True,
981 },
982 'skip': 'This video is not available.',
983 },
984 {
985 # YouTube Red video with episode data
986 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
987 'info_dict': {
988 'id': 'iqKdEhx-dD4',
989 'ext': 'mp4',
990 'title': 'Isolation - Mind Field (Ep 1)',
991 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
992 'duration': 2085,
993 'upload_date': '20170118',
994 'uploader': 'Vsauce',
995 'uploader_id': 'Vsauce',
996 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
997 'series': 'Mind Field',
998 'season_number': 1,
999 'episode_number': 1,
1000 },
1001 'params': {
1002 'skip_download': True,
1003 },
1004 'expected_warnings': [
1005 'Skipping DASH manifest',
1006 ],
1007 },
1008 {
1009 # The following content has been identified by the YouTube community
1010 # as inappropriate or offensive to some audiences.
1011 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1012 'info_dict': {
1013 'id': '6SJNVb0GnPI',
1014 'ext': 'mp4',
1015 'title': 'Race Differences in Intelligence',
1016 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1017 'duration': 965,
1018 'upload_date': '20140124',
1019 'uploader': 'New Century Foundation',
1020 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1021 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1022 },
1023 'params': {
1024 'skip_download': True,
1025 },
1026 },
1027 {
1028 # itag 212
1029 'url': '1t24XAntNCY',
1030 'only_matching': True,
1031 },
1032 {
1033 # geo restricted to JP
1034 'url': 'sJL6WA-aGkQ',
1035 'only_matching': True,
1036 },
1037 {
1038 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1039 'only_matching': True,
1040 },
1041 {
1042 # DRM protected
1043 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1044 'only_matching': True,
1045 },
1046 {
1047 # Video with unsupported adaptive stream type formats
1048 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1049 'info_dict': {
1050 'id': 'Z4Vy8R84T1U',
1051 'ext': 'mp4',
1052 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1053 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1054 'duration': 433,
1055 'upload_date': '20130923',
1056 'uploader': 'Amelia Putri Harwita',
1057 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1058 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1059 'formats': 'maxcount:10',
1060 },
1061 'params': {
1062 'skip_download': True,
1063 'youtube_include_dash_manifest': False,
1064 },
1065 'skip': 'not actual anymore',
1066 },
1067 {
1068 # Youtube Music Auto-generated description
1069 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1070 'info_dict': {
1071 'id': 'MgNrAu2pzNs',
1072 'ext': 'mp4',
1073 'title': 'Voyeur Girl',
1074 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1075 'upload_date': '20190312',
1076 'uploader': 'Stephen - Topic',
1077 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1078 'artist': 'Stephen',
1079 'track': 'Voyeur Girl',
1080 'album': 'it\'s too much love to know my dear',
1081 'release_date': '20190313',
1082 'release_year': 2019,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
1089 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1090 'only_matching': True,
1091 },
1092 {
1093 # invalid -> valid video id redirection
1094 'url': 'DJztXj2GPfl',
1095 'info_dict': {
1096 'id': 'DJztXj2GPfk',
1097 'ext': 'mp4',
1098 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1099 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1100 'upload_date': '20090125',
1101 'uploader': 'Prochorowka',
1102 'uploader_id': 'Prochorowka',
1103 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1104 'artist': 'Panjabi MC',
1105 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1106 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1107 },
1108 'params': {
1109 'skip_download': True,
1110 },
1111 },
1112 {
1113 # empty description results in an empty string
1114 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1115 'info_dict': {
1116 'id': 'x41yOUIvK2k',
1117 'ext': 'mp4',
1118 'title': 'IMG 3456',
1119 'description': '',
1120 'upload_date': '20170613',
1121 'uploader_id': 'ElevageOrVert',
1122 'uploader': 'ElevageOrVert',
1123 },
1124 'params': {
1125 'skip_download': True,
1126 },
1127 },
1128 {
1129 # with '};' inside yt initial data (see [1])
1130 # see [2] for an example with '};' inside ytInitialPlayerResponse
1131 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1132 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1133 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1134 'info_dict': {
1135 'id': 'CHqg6qOn4no',
1136 'ext': 'mp4',
1137 'title': 'Part 77 Sort a list of simple types in c#',
1138 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1139 'upload_date': '20130831',
1140 'uploader_id': 'kudvenkat',
1141 'uploader': 'kudvenkat',
1142 },
1143 'params': {
1144 'skip_download': True,
1145 },
1146 },
1147 {
1148 # another example of '};' in ytInitialData
1149 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1150 'only_matching': True,
1151 },
1152 {
1153 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1154 'only_matching': True,
1155 },
1156 ]
1157
1158 def __init__(self, *args, **kwargs):
1159 super(YoutubeIE, self).__init__(*args, **kwargs)
1160 self._player_cache = {}
1161
1162 def report_video_info_webpage_download(self, video_id):
1163 """Report attempt to download video info webpage."""
1164 self.to_screen('%s: Downloading video info webpage' % video_id)
1165
1166 def report_information_extraction(self, video_id):
1167 """Report attempt to extract video information."""
1168 self.to_screen('%s: Extracting video information' % video_id)
1169
1170 def report_unavailable_format(self, video_id, format):
1171 """Report extracted video URL."""
1172 self.to_screen('%s: Format %s not available' % (video_id, format))
1173
1174 def report_rtmp_download(self):
1175 """Indicate the download will use the RTMP protocol."""
1176 self.to_screen('RTMP download detected')
1177
1178 def _signature_cache_id(self, example_sig):
1179 """ Return a string representation of a signature """
1180 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1181
1182 @classmethod
1183 def _extract_player_info(cls, player_url):
1184 for player_re in cls._PLAYER_INFO_RE:
1185 id_m = re.search(player_re, player_url)
1186 if id_m:
1187 break
1188 else:
1189 raise ExtractorError('Cannot identify player %r' % player_url)
1190 return id_m.group('ext'), id_m.group('id')
1191
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (and cache) the signature-decryption function for a player.

        First looks up a previously derived permutation spec in the
        filesystem cache; otherwise downloads the JS or SWF player,
        extracts the decipher routine, probes it with a test string to
        learn the character permutation, and stores that spec.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # The cache key includes the signature's length pattern because the
        # player applies different transforms depending on input layout.
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id  # guard against path separators in the cache id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices; re-applying it
            # reproduces the decryption without re-downloading the player.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Probe the extracted function with a string of distinct characters
        # to learn which source index each output position comes from; this
        # assumes the algorithm only rearranges/drops characters.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1231
    def _print_sig_code(self, func, example_sig):
        """Print Python source that reproduces the signature permutation.

        Probes *func* with a distinct-character test string, then renders
        the resulting index sequence as compact slice expressions. Used for
        debugging via the youtube_print_sig_code option.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts that match
                # Python's slicing defaults for brevity.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, coalescing runs with stride +/-1
            # into slices and emitting isolated indices as s[i].
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a constant-stride run: extend or close it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Adjacent indices start a new +1/-1 run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1270
1271 def _parse_sig_js(self, jscode):
1272 funcname = self._search_regex(
1273 (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1274 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1275 r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1276 r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1277 # Obsolete patterns
1278 r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1279 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1280 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1281 r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1282 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1283 r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1284 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1285 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1286 jscode, 'Initial JS player signature function name', group='sig')
1287
1288 jsi = JSInterpreter(jscode)
1289 initial_function = jsi.extract_function(funcname)
1290 return lambda s: initial_function([s])
1291
1292 def _parse_sig_swf(self, file_contents):
1293 swfi = SWFInterpreter(file_contents)
1294 TARGET_CLASSNAME = 'SignatureDecipher'
1295 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1296 initial_function = swfi.extract_function(searched_class, 'decipher')
1297 return lambda s: initial_function([s])
1298
1299 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1300 """Turn the encrypted s field into a working signature"""
1301
1302 if player_url is None:
1303 raise ExtractorError('Cannot decrypt signature without player_url')
1304
1305 if player_url.startswith('//'):
1306 player_url = 'https:' + player_url
1307 elif not re.match(r'https?://', player_url):
1308 player_url = compat_urlparse.urljoin(
1309 'https://www.youtube.com', player_url)
1310 try:
1311 player_id = (player_url, self._signature_cache_id(s))
1312 if player_id not in self._player_cache:
1313 func = self._extract_signature_function(
1314 video_id, player_url, s
1315 )
1316 self._player_cache[player_id] = func
1317 func = self._player_cache[player_id]
1318 if self._downloader.params.get('youtube_print_sig_code'):
1319 self._print_sig_code(func, s)
1320 return func(s)
1321 except Exception as e:
1322 tb = traceback.format_exc()
1323 raise ExtractorError(
1324 'Signature extraction failed: ' + tb, cause=e)
1325
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Return available manual subtitles as {lang: [format dicts]}.

        Queries the legacy timedtext track-list endpoint; additionally
        appends a pseudo 'live_chat' track when a live chat replay is
        available.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best-effort: missing subtitles only produce a warning.
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track per language.
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            # NOTE(review): this entry carries 'video_id' but no 'url'; the
            # youtube_live_chat_replay protocol downloader presumably derives
            # the URL from 'video_id' itself — confirm against that downloader.
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1365
1366 def _get_ytplayer_config(self, video_id, webpage):
1367 patterns = (
1368 # User data may contain arbitrary character sequences that may affect
1369 # JSON extraction with regex, e.g. when '};' is contained the second
1370 # regex won't capture the whole JSON. Yet working around by trying more
1371 # concrete regex first keeping in mind proper quoted string handling
1372 # to be implemented in future that will replace this workaround (see
1373 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1374 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1375 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1376 r';ytplayer\.config\s*=\s*({.+?});',
1377 )
1378 config = self._search_regex(
1379 patterns, webpage, 'ytplayer.config', default=None)
1380 if config:
1381 return self._parse_json(
1382 uppercase_escape(config), video_id, fatal=False)
1383
    def _get_automatic_captions(self, video_id, player_response, player_config):
        """Return automatic captions as {lang: [format dicts]}.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process. Tries, in order: the post-2017
        player_response captionTracks data, the legacy ttsurl flow, and the
        even older caption_tracks args. Any lookup failure degrades to a
        warning and an empty dict.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not (player_response or player_config):
            self._downloader.report_warning(err_msg)
            return {}
        # The whole extraction is wrapped in one try: any missing key/index
        # (or a nested ExtractorError) means "no automatic captions".
        try:
            args = player_config.get('args') if player_config else {}
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy flow: ttsurl + timestamp from ytplayer.config args.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the base
                # caption URL's query string for each language/format pair.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1480
1481 def _mark_watched(self, video_id, video_info, player_response):
1482 playback_url = url_or_none(try_get(
1483 player_response,
1484 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1485 video_info, lambda x: x['videostats_playback_base_url'][0]))
1486 if not playback_url:
1487 return
1488 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1489 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1490
1491 # cpn generation algorithm is reverse engineered from base.js.
1492 # In fact it works even with dummy cpn.
1493 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1494 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1495
1496 qs.update({
1497 'ver': ['2'],
1498 'cpn': [cpn],
1499 })
1500 playback_url = compat_urlparse.urlunparse(
1501 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1502
1503 self._download_webpage(
1504 playback_url, video_id, 'Marking watched',
1505 'Unable to mark watched', fatal=False)
1506
1507 @staticmethod
1508 def _extract_urls(webpage):
1509 # Embedded YouTube player
1510 entries = [
1511 unescapeHTML(mobj.group('url'))
1512 for mobj in re.finditer(r'''(?x)
1513 (?:
1514 <iframe[^>]+?src=|
1515 data-video-url=|
1516 <embed[^>]+?src=|
1517 embedSWF\(?:\s*|
1518 <object[^>]+data=|
1519 new\s+SWFObject\(
1520 )
1521 (["\'])
1522 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1523 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1524 \1''', webpage)]
1525
1526 # lazyYT YouTube embed
1527 entries.extend(list(map(
1528 unescapeHTML,
1529 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1530
1531 # Wordpress "YouTube Video Importer" plugin
1532 matches = re.findall(r'''(?x)<div[^>]+
1533 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1534 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1535 entries.extend(m[-1] for m in matches)
1536
1537 return entries
1538
1539 @staticmethod
1540 def _extract_url(webpage):
1541 urls = YoutubeIE._extract_urls(webpage)
1542 return urls[0] if urls else None
1543
1544 @classmethod
1545 def extract_id(cls, url):
1546 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1547 if mobj is None:
1548 raise ExtractorError('Invalid URL: %s' % url)
1549 video_id = mobj.group(2)
1550 return video_id
1551
1552 def _extract_chapters_from_json(self, webpage, video_id, duration):
1553 if not webpage:
1554 return
1555 data = self._extract_yt_initial_data(video_id, webpage)
1556 if not data or not isinstance(data, dict):
1557 return
1558 chapters_list = try_get(
1559 data,
1560 lambda x: x['playerOverlays']
1561 ['playerOverlayRenderer']
1562 ['decoratedPlayerBarRenderer']
1563 ['decoratedPlayerBarRenderer']
1564 ['playerBar']
1565 ['chapteredPlayerBarRenderer']
1566 ['chapters'],
1567 list)
1568 if not chapters_list:
1569 return
1570
1571 def chapter_time(chapter):
1572 return float_or_none(
1573 try_get(
1574 chapter,
1575 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1576 int),
1577 scale=1000)
1578 chapters = []
1579 for next_num, chapter in enumerate(chapters_list, start=1):
1580 start_time = chapter_time(chapter)
1581 if start_time is None:
1582 continue
1583 end_time = (chapter_time(chapters_list[next_num])
1584 if next_num < len(chapters_list) else duration)
1585 if end_time is None:
1586 continue
1587 title = try_get(
1588 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1589 compat_str)
1590 chapters.append({
1591 'start_time': start_time,
1592 'end_time': end_time,
1593 'title': title,
1594 })
1595 return chapters
1596
1597 @staticmethod
1598 def _extract_chapters_from_description(description, duration):
1599 if not description:
1600 return None
1601 chapter_lines = re.findall(
1602 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1603 description)
1604 if not chapter_lines:
1605 return None
1606 chapters = []
1607 for next_num, (chapter_line, time_point) in enumerate(
1608 chapter_lines, start=1):
1609 start_time = parse_duration(time_point)
1610 if start_time is None:
1611 continue
1612 if start_time > duration:
1613 break
1614 end_time = (duration if next_num == len(chapter_lines)
1615 else parse_duration(chapter_lines[next_num][1]))
1616 if end_time is None:
1617 continue
1618 if end_time > duration:
1619 end_time = duration
1620 if start_time > end_time:
1621 break
1622 chapter_title = re.sub(
1623 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1624 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1625 chapters.append({
1626 'start_time': start_time,
1627 'end_time': end_time,
1628 'title': chapter_title,
1629 })
1630 return chapters
1631
1632 def _extract_chapters(self, webpage, description, video_id, duration):
1633 return (self._extract_chapters_from_json(webpage, video_id, duration)
1634 or self._extract_chapters_from_description(description, duration))
1635
1636 def _real_extract(self, url):
1637 url, smuggled_data = unsmuggle_url(url, {})
1638
1639 proto = (
1640 'http' if self._downloader.params.get('prefer_insecure', False)
1641 else 'https')
1642
1643 start_time = None
1644 end_time = None
1645 parsed_url = compat_urllib_parse_urlparse(url)
1646 for component in [parsed_url.fragment, parsed_url.query]:
1647 query = compat_parse_qs(component)
1648 if start_time is None and 't' in query:
1649 start_time = parse_duration(query['t'][0])
1650 if start_time is None and 'start' in query:
1651 start_time = parse_duration(query['start'][0])
1652 if end_time is None and 'end' in query:
1653 end_time = parse_duration(query['end'][0])
1654
1655 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1656 mobj = re.search(self._NEXT_URL_RE, url)
1657 if mobj:
1658 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1659 video_id = self.extract_id(url)
1660
1661 # Get video webpage
1662 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1663 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1664
1665 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1666 video_id = qs.get('v', [None])[0] or video_id
1667
1668 # Attempt to extract SWF player URL
1669 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1670 if mobj is not None:
1671 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1672 else:
1673 player_url = None
1674
1675 dash_mpds = []
1676
1677 def add_dash_mpd(video_info):
1678 dash_mpd = video_info.get('dashmpd')
1679 if dash_mpd and dash_mpd[0] not in dash_mpds:
1680 dash_mpds.append(dash_mpd[0])
1681
1682 def add_dash_mpd_pr(pl_response):
1683 dash_mpd = url_or_none(try_get(
1684 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1685 compat_str))
1686 if dash_mpd and dash_mpd not in dash_mpds:
1687 dash_mpds.append(dash_mpd)
1688
1689 is_live = None
1690 view_count = None
1691
1692 def extract_view_count(v_info):
1693 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1694
1695 def extract_player_response(player_response, video_id):
1696 pl_response = str_or_none(player_response)
1697 if not pl_response:
1698 return
1699 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1700 if isinstance(pl_response, dict):
1701 add_dash_mpd_pr(pl_response)
1702 return pl_response
1703
1704 def extract_embedded_config(embed_webpage, video_id):
1705 embedded_config = self._search_regex(
1706 r'setConfig\(({.*})\);',
1707 embed_webpage, 'ytInitialData', default=None)
1708 if embedded_config:
1709 return embedded_config
1710
1711 video_info = {}
1712 player_response = {}
1713 ytplayer_config = None
1714 embed_webpage = None
1715
1716 # Get video info
1717 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1718 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1719 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1720 age_gate = True
1721 # We simulate the access to the video from www.youtube.com/v/{video_id}
1722 # this can be viewed without login into Youtube
1723 url = proto + '://www.youtube.com/embed/%s' % video_id
1724 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1725 ext = extract_embedded_config(embed_webpage, video_id)
1726 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1727 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1728 if not playable_in_embed:
1729 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1730 playable_in_embed = ''
1731 else:
1732 playable_in_embed = playable_in_embed.group('playableinEmbed')
1733 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1734 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1735 if playable_in_embed == 'false':
1736 '''
1737 # TODO apply this patch when Support for Python 2.6(!) and above drops
1738 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1739 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1740 '''
1741 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1742 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1743 age_gate = False
1744 # Try looking directly into the video webpage
1745 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1746 if ytplayer_config:
1747 args = ytplayer_config.get("args")
1748 if args is not None:
1749 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1750 # Convert to the same format returned by compat_parse_qs
1751 video_info = dict((k, [v]) for k, v in args.items())
1752 add_dash_mpd(video_info)
1753 # Rental video is not rented but preview is available (e.g.
1754 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1755 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1756 if not video_info and args.get('ypc_vid'):
1757 return self.url_result(
1758 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1759 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1760 is_live = True
1761 if not player_response:
1762 player_response = extract_player_response(args.get('player_response'), video_id)
1763 elif not player_response:
1764 player_response = ytplayer_config
1765 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1766 add_dash_mpd_pr(player_response)
1767 else:
1768 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1769 else:
1770 data = compat_urllib_parse_urlencode({
1771 'video_id': video_id,
1772 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1773 'sts': self._search_regex(
1774 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1775 })
1776 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1777 try:
1778 video_info_webpage = self._download_webpage(
1779 video_info_url, video_id,
1780 note='Refetching age-gated info webpage',
1781 errnote='unable to download video info webpage')
1782 except ExtractorError:
1783 video_info_webpage = None
1784 if video_info_webpage:
1785 video_info = compat_parse_qs(video_info_webpage)
1786 pl_response = video_info.get('player_response', [None])[0]
1787 player_response = extract_player_response(pl_response, video_id)
1788 add_dash_mpd(video_info)
1789 view_count = extract_view_count(video_info)
1790 else:
1791 age_gate = False
1792 # Try looking directly into the video webpage
1793 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1794 if ytplayer_config:
1795 args = ytplayer_config.get('args', {})
1796 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1797 # Convert to the same format returned by compat_parse_qs
1798 video_info = dict((k, [v]) for k, v in args.items())
1799 add_dash_mpd(video_info)
1800 # Rental video is not rented but preview is available (e.g.
1801 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1802 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1803 if not video_info and args.get('ypc_vid'):
1804 return self.url_result(
1805 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1806 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1807 is_live = True
1808 if not player_response:
1809 player_response = extract_player_response(args.get('player_response'), video_id)
1810 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1811 add_dash_mpd_pr(player_response)
1812
1813 if not video_info and not player_response:
1814 player_response = extract_player_response(
1815 self._search_regex(
1816 (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
1817 self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
1818 'initial player response', default='{}'),
1819 video_id)
1820
1821 def extract_unavailable_message():
1822 messages = []
1823 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1824 msg = self._html_search_regex(
1825 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1826 video_webpage, 'unavailable %s' % kind, default=None)
1827 if msg:
1828 messages.append(msg)
1829 if messages:
1830 return '\n'.join(messages)
1831
1832 if not video_info and not player_response:
1833 unavailable_message = extract_unavailable_message()
1834 if not unavailable_message:
1835 unavailable_message = 'Unable to extract video data'
1836 raise ExtractorError(
1837 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1838
1839 if not isinstance(video_info, dict):
1840 video_info = {}
1841
1842 playable_in_embed = try_get(
1843 player_response, lambda x: x['playabilityStatus']['playableInEmbed'])
1844
1845 video_details = try_get(
1846 player_response, lambda x: x['videoDetails'], dict) or {}
1847
1848 microformat = try_get(
1849 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1850
1851 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1852 if not video_title:
1853 self._downloader.report_warning('Unable to extract video title')
1854 video_title = '_'
1855
1856 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1857 if video_description:
1858
1859 def replace_url(m):
1860 redir_url = compat_urlparse.urljoin(url, m.group(1))
1861 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1862 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1863 qs = compat_parse_qs(parsed_redir_url.query)
1864 q = qs.get('q')
1865 if q and q[0]:
1866 return q[0]
1867 return redir_url
1868
1869 description_original = video_description = re.sub(r'''(?x)
1870 <a\s+
1871 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1872 (?:title|href)="([^"]+)"\s+
1873 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1874 class="[^"]*"[^>]*>
1875 [^<]+\.{3}\s*
1876 </a>
1877 ''', replace_url, video_description)
1878 video_description = clean_html(video_description)
1879 else:
1880 video_description = video_details.get('shortDescription')
1881 if video_description is None:
1882 video_description = self._html_search_meta('description', video_webpage)
1883
1884 if not smuggled_data.get('force_singlefeed', False):
1885 if not self._downloader.params.get('noplaylist'):
1886 multifeed_metadata_list = try_get(
1887 player_response,
1888 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1889 compat_str) or try_get(
1890 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1891 if multifeed_metadata_list:
1892 entries = []
1893 feed_ids = []
1894 for feed in multifeed_metadata_list.split(','):
1895 # Unquote should take place before split on comma (,) since textual
1896 # fields may contain comma as well (see
1897 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1898 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1899
                        def feed_entry(name):
                            # feed_data is parse_qs output, so each value is a
                            # single-element list; return the first value of
                            # `name` as a string, or None when absent.
                            return try_get(feed_data, lambda x: x[name][0], compat_str)
1902
1903 feed_id = feed_entry('id')
1904 if not feed_id:
1905 continue
1906 feed_title = feed_entry('title')
1907 title = video_title
1908 if feed_title:
1909 title += ' (%s)' % feed_title
1910 entries.append({
1911 '_type': 'url_transparent',
1912 'ie_key': 'Youtube',
1913 'url': smuggle_url(
1914 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1915 {'force_singlefeed': True}),
1916 'title': title,
1917 })
1918 feed_ids.append(feed_id)
1919 self.to_screen(
1920 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1921 % (', '.join(feed_ids), video_id))
1922 return self.playlist_result(entries, video_id, video_title, video_description)
1923 else:
1924 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1925
1926 if view_count is None:
1927 view_count = extract_view_count(video_info)
1928 if view_count is None and video_details:
1929 view_count = int_or_none(video_details.get('viewCount'))
1930 if view_count is None and microformat:
1931 view_count = int_or_none(microformat.get('viewCount'))
1932
1933 if is_live is None:
1934 is_live = bool_or_none(video_details.get('isLive'))
1935
1936 has_live_chat_replay = False
1937 if not is_live:
1938 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
1939 try:
1940 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1941 has_live_chat_replay = True
1942 except (KeyError, IndexError, TypeError):
1943 pass
1944
1945 # Check for "rental" videos
1946 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1947 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1948
1949 def _extract_filesize(media_url):
1950 return int_or_none(self._search_regex(
1951 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1952
1953 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1954 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1955
1956 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1957 self.report_rtmp_download()
1958 formats = [{
1959 'format_id': '_rtmp',
1960 'protocol': 'rtmp',
1961 'url': video_info['conn'][0],
1962 'player_url': player_url,
1963 }]
1964 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1965 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1966 if 'rtmpe%3Dyes' in encoded_url_map:
1967 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1968 formats = []
1969 formats_spec = {}
1970 fmt_list = video_info.get('fmt_list', [''])[0]
1971 if fmt_list:
1972 for fmt in fmt_list.split(','):
1973 spec = fmt.split('/')
1974 if len(spec) > 1:
1975 width_height = spec[1].split('x')
1976 if len(width_height) == 2:
1977 formats_spec[spec[0]] = {
1978 'resolution': spec[1],
1979 'width': int_or_none(width_height[0]),
1980 'height': int_or_none(width_height[1]),
1981 }
1982 for fmt in streaming_formats:
1983 itag = str_or_none(fmt.get('itag'))
1984 if not itag:
1985 continue
1986 quality = fmt.get('quality')
1987 quality_label = fmt.get('qualityLabel') or quality
1988 formats_spec[itag] = {
1989 'asr': int_or_none(fmt.get('audioSampleRate')),
1990 'filesize': int_or_none(fmt.get('contentLength')),
1991 'format_note': quality_label,
1992 'fps': int_or_none(fmt.get('fps')),
1993 'height': int_or_none(fmt.get('height')),
1994 # bitrate for itag 43 is always 2147483647
1995 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1996 'width': int_or_none(fmt.get('width')),
1997 }
1998
1999 for fmt in streaming_formats:
2000 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2001 continue
2002 url = url_or_none(fmt.get('url'))
2003
2004 if not url:
2005 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2006 if not cipher:
2007 continue
2008 url_data = compat_parse_qs(cipher)
2009 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2010 if not url:
2011 continue
2012 else:
2013 cipher = None
2014 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2015
2016 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2017 # Unsupported FORMAT_STREAM_TYPE_OTF
2018 if stream_type == 3:
2019 continue
2020
2021 format_id = fmt.get('itag') or url_data['itag'][0]
2022 if not format_id:
2023 continue
2024 format_id = compat_str(format_id)
2025
2026 if cipher:
2027 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2028 ASSETS_RE = (
2029 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2030 r'"jsUrl"\s*:\s*("[^"]+")',
2031 r'"assets":.+?"js":\s*("[^"]+")')
2032 jsplayer_url_json = self._search_regex(
2033 ASSETS_RE,
2034 embed_webpage if age_gate else video_webpage,
2035 'JS player URL (1)', default=None)
2036 if not jsplayer_url_json and not age_gate:
2037 # We need the embed website after all
2038 if embed_webpage is None:
2039 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2040 embed_webpage = self._download_webpage(
2041 embed_url, video_id, 'Downloading embed webpage')
2042 jsplayer_url_json = self._search_regex(
2043 ASSETS_RE, embed_webpage, 'JS player URL')
2044
2045 player_url = json.loads(jsplayer_url_json)
2046 if player_url is None:
2047 player_url_json = self._search_regex(
2048 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2049 video_webpage, 'age gate player URL')
2050 player_url = json.loads(player_url_json)
2051
2052 if 'sig' in url_data:
2053 url += '&signature=' + url_data['sig'][0]
2054 elif 's' in url_data:
2055 encrypted_sig = url_data['s'][0]
2056
2057 if self._downloader.params.get('verbose'):
2058 if player_url is None:
2059 player_desc = 'unknown'
2060 else:
2061 player_type, player_version = self._extract_player_info(player_url)
2062 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2063 parts_sizes = self._signature_cache_id(encrypted_sig)
2064 self.to_screen('{%s} signature length %s, %s' %
2065 (format_id, parts_sizes, player_desc))
2066
2067 signature = self._decrypt_signature(
2068 encrypted_sig, video_id, player_url, age_gate)
2069 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2070 url += '&%s=%s' % (sp, signature)
2071 if 'ratebypass' not in url:
2072 url += '&ratebypass=yes'
2073
2074 dct = {
2075 'format_id': format_id,
2076 'url': url,
2077 'player_url': player_url,
2078 }
2079 if format_id in self._formats:
2080 dct.update(self._formats[format_id])
2081 if format_id in formats_spec:
2082 dct.update(formats_spec[format_id])
2083
2084 # Some itags are not included in DASH manifest thus corresponding formats will
2085 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2086 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2087 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2088 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2089
2090 if width is None:
2091 width = int_or_none(fmt.get('width'))
2092 if height is None:
2093 height = int_or_none(fmt.get('height'))
2094
2095 filesize = int_or_none(url_data.get(
2096 'clen', [None])[0]) or _extract_filesize(url)
2097
2098 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2099 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2100
2101 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2102 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2103 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2104
2105 more_fields = {
2106 'filesize': filesize,
2107 'tbr': tbr,
2108 'width': width,
2109 'height': height,
2110 'fps': fps,
2111 'format_note': quality_label or quality,
2112 }
2113 for key, value in more_fields.items():
2114 if value:
2115 dct[key] = value
2116 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2117 if type_:
2118 type_split = type_.split(';')
2119 kind_ext = type_split[0].split('/')
2120 if len(kind_ext) == 2:
2121 kind, _ = kind_ext
2122 dct['ext'] = mimetype2ext(type_split[0])
2123 if kind in ('audio', 'video'):
2124 codecs = None
2125 for mobj in re.finditer(
2126 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2127 if mobj.group('key') == 'codecs':
2128 codecs = mobj.group('val')
2129 break
2130 if codecs:
2131 dct.update(parse_codecs(codecs))
2132 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2133 dct['downloader_options'] = {
2134 # Youtube throttles chunks >~10M
2135 'http_chunk_size': 10485760,
2136 }
2137 formats.append(dct)
2138 else:
2139 manifest_url = (
2140 url_or_none(try_get(
2141 player_response,
2142 lambda x: x['streamingData']['hlsManifestUrl'],
2143 compat_str))
2144 or url_or_none(try_get(
2145 video_info, lambda x: x['hlsvp'][0], compat_str)))
2146 if manifest_url:
2147 formats = []
2148 m3u8_formats = self._extract_m3u8_formats(
2149 manifest_url, video_id, 'mp4', fatal=False)
2150 for a_format in m3u8_formats:
2151 itag = self._search_regex(
2152 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2153 if itag:
2154 a_format['format_id'] = itag
2155 if itag in self._formats:
2156 dct = self._formats[itag].copy()
2157 dct.update(a_format)
2158 a_format = dct
2159 a_format['player_url'] = player_url
2160 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2161 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2162 if self._downloader.params.get('youtube_include_hls_manifest', True):
2163 formats.append(a_format)
2164 else:
2165 error_message = extract_unavailable_message()
2166 if not error_message:
2167 reason_list = try_get(
2168 player_response,
2169 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2170 list) or []
2171 for reason in reason_list:
2172 if not isinstance(reason, dict):
2173 continue
2174 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2175 if reason_text:
2176 if not error_message:
2177 error_message = ''
2178 error_message += reason_text
2179 if error_message:
2180 error_message = clean_html(error_message)
2181 if not error_message:
2182 error_message = clean_html(try_get(
2183 player_response, lambda x: x['playabilityStatus']['reason'],
2184 compat_str))
2185 if not error_message:
2186 error_message = clean_html(
2187 try_get(video_info, lambda x: x['reason'][0], compat_str))
2188 if error_message:
2189 raise ExtractorError(error_message, expected=True)
2190 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2191
2192 # uploader
2193 video_uploader = try_get(
2194 video_info, lambda x: x['author'][0],
2195 compat_str) or str_or_none(video_details.get('author'))
2196 if video_uploader:
2197 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2198 else:
2199 self._downloader.report_warning('unable to extract uploader name')
2200
2201 # uploader_id
2202 video_uploader_id = None
2203 video_uploader_url = None
2204 mobj = re.search(
2205 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2206 video_webpage)
2207 if mobj is not None:
2208 video_uploader_id = mobj.group('uploader_id')
2209 video_uploader_url = mobj.group('uploader_url')
2210 else:
2211 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2212 if owner_profile_url:
2213 video_uploader_id = self._search_regex(
2214 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2215 default=None)
2216 video_uploader_url = owner_profile_url
2217
2218 channel_id = (
2219 str_or_none(video_details.get('channelId'))
2220 or self._html_search_meta(
2221 'channelId', video_webpage, 'channel id', default=None)
2222 or self._search_regex(
2223 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2224 video_webpage, 'channel id', default=None, group='id'))
2225 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2226
2227 thumbnails = []
2228 thumbnails_list = try_get(
2229 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2230 for t in thumbnails_list:
2231 if not isinstance(t, dict):
2232 continue
2233 thumbnail_url = url_or_none(t.get('url'))
2234 if not thumbnail_url:
2235 continue
2236 thumbnails.append({
2237 'url': thumbnail_url,
2238 'width': int_or_none(t.get('width')),
2239 'height': int_or_none(t.get('height')),
2240 })
2241
2242 if not thumbnails:
2243 video_thumbnail = None
2244 # We try first to get a high quality image:
2245 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2246 video_webpage, re.DOTALL)
2247 if m_thumb is not None:
2248 video_thumbnail = m_thumb.group(1)
2249 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2250 if thumbnail_url:
2251 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2252 if video_thumbnail:
2253 thumbnails.append({'url': video_thumbnail})
2254
2255 # upload date
2256 upload_date = self._html_search_meta(
2257 'datePublished', video_webpage, 'upload date', default=None)
2258 if not upload_date:
2259 upload_date = self._search_regex(
2260 [r'(?s)id="eow-date.*?>(.*?)</span>',
2261 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2262 video_webpage, 'upload date', default=None)
2263 if not upload_date:
2264 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2265 upload_date = unified_strdate(upload_date)
2266
2267 video_license = self._html_search_regex(
2268 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2269 video_webpage, 'license', default=None)
2270
2271 m_music = re.search(
2272 r'''(?x)
2273 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2274 <ul[^>]*>\s*
2275 <li>(?P<title>.+?)
2276 by (?P<creator>.+?)
2277 (?:
2278 \(.+?\)|
2279 <a[^>]*
2280 (?:
2281 \bhref=["\']/red[^>]*>| # drop possible
2282 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2283 )
2284 .*?
2285 )?</li
2286 ''',
2287 video_webpage)
2288 if m_music:
2289 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2290 video_creator = clean_html(m_music.group('creator'))
2291 else:
2292 video_alt_title = video_creator = None
2293
2294 def extract_meta(field):
2295 return self._html_search_regex(
2296 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2297 video_webpage, field, default=None)
2298
2299 track = extract_meta('Song')
2300 artist = extract_meta('Artist')
2301 album = extract_meta('Album')
2302
2303 # Youtube Music Auto-generated description
2304 release_date = release_year = None
2305 if video_description:
2306 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2307 if mobj:
2308 if not track:
2309 track = mobj.group('track').strip()
2310 if not artist:
2311 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2312 if not album:
2313 album = mobj.group('album'.strip())
2314 release_year = mobj.group('release_year')
2315 release_date = mobj.group('release_date')
2316 if release_date:
2317 release_date = release_date.replace('-', '')
2318 if not release_year:
2319 release_year = int(release_date[:4])
2320 if release_year:
2321 release_year = int(release_year)
2322
2323 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2324 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2325 for content in contents:
2326 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2327 multiple_songs = False
2328 for row in rows:
2329 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2330 multiple_songs = True
2331 break
2332 for row in rows:
2333 mrr = row.get('metadataRowRenderer') or {}
2334 mrr_title = try_get(
2335 mrr, lambda x: x['title']['simpleText'], compat_str)
2336 mrr_contents = try_get(
2337 mrr, lambda x: x['contents'][0], dict) or {}
2338 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2339 if not (mrr_title and mrr_contents_text):
2340 continue
2341 if mrr_title == 'License':
2342 video_license = mrr_contents_text
2343 elif not multiple_songs:
2344 if mrr_title == 'Album':
2345 album = mrr_contents_text
2346 elif mrr_title == 'Artist':
2347 artist = mrr_contents_text
2348 elif mrr_title == 'Song':
2349 track = mrr_contents_text
2350
2351 m_episode = re.search(
2352 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2353 video_webpage)
2354 if m_episode:
2355 series = unescapeHTML(m_episode.group('series'))
2356 season_number = int(m_episode.group('season'))
2357 episode_number = int(m_episode.group('episode'))
2358 else:
2359 series = season_number = episode_number = None
2360
2361 m_cat_container = self._search_regex(
2362 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2363 video_webpage, 'categories', default=None)
2364 category = None
2365 if m_cat_container:
2366 category = self._html_search_regex(
2367 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2368 default=None)
2369 if not category:
2370 category = try_get(
2371 microformat, lambda x: x['category'], compat_str)
2372 video_categories = None if category is None else [category]
2373
2374 video_tags = [
2375 unescapeHTML(m.group('content'))
2376 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2377 if not video_tags:
2378 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2379
2380 def _extract_count(count_name):
2381 return str_to_int(self._search_regex(
2382 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2383 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2384 video_webpage, count_name, default=None))
2385
2386 like_count = _extract_count('like')
2387 dislike_count = _extract_count('dislike')
2388
2389 if view_count is None:
2390 view_count = str_to_int(self._search_regex(
2391 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2392 'view count', default=None))
2393
2394 average_rating = (
2395 float_or_none(video_details.get('averageRating'))
2396 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2397
2398 # subtitles
2399 video_subtitles = self.extract_subtitles(
2400 video_id, video_webpage, has_live_chat_replay)
2401 automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
2402
2403 video_duration = try_get(
2404 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2405 if not video_duration:
2406 video_duration = int_or_none(video_details.get('lengthSeconds'))
2407 if not video_duration:
2408 video_duration = parse_duration(self._html_search_meta(
2409 'duration', video_webpage, 'video duration'))
2410
2411 # Get Subscriber Count of channel
2412 subscriber_count = parse_count(self._search_regex(
2413 r'"text":"([\d\.]+\w?) subscribers"',
2414 video_webpage,
2415 'subscriber count',
2416 default=None
2417 ))
2418
2419 # get xsrf for annotations or comments
2420 get_annotations = self._downloader.params.get('writeannotations', False)
2421 get_comments = self._downloader.params.get('getcomments', False)
2422 if get_annotations or get_comments:
2423 xsrf_token = None
2424 ytcfg = self._extract_ytcfg(video_id, video_webpage)
2425 if ytcfg:
2426 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2427 if not xsrf_token:
2428 xsrf_token = self._search_regex(
2429 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2430 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2431
2432 # annotations
2433 video_annotations = None
2434 if get_annotations:
2435 invideo_url = try_get(
2436 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2437 if xsrf_token and invideo_url:
2438 xsrf_field_name = None
2439 if ytcfg:
2440 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2441 if not xsrf_field_name:
2442 xsrf_field_name = self._search_regex(
2443 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2444 video_webpage, 'xsrf field name',
2445 group='xsrf_field_name', default='session_token')
2446 video_annotations = self._download_webpage(
2447 self._proto_relative_url(invideo_url),
2448 video_id, note='Downloading annotations',
2449 errnote='Unable to download video annotations', fatal=False,
2450 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2451
2452 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2453
2454 # Get comments
2455 # TODO: Refactor and move to seperate function
2456 if get_comments:
2457 expected_video_comment_count = 0
2458 video_comments = []
2459
2460 def find_value(html, key, num_chars=2, separator='"'):
2461 pos_begin = html.find(key) + len(key) + num_chars
2462 pos_end = html.find(separator, pos_begin)
2463 return html[pos_begin: pos_end]
2464
2465 def search_dict(partial, key):
2466 if isinstance(partial, dict):
2467 for k, v in partial.items():
2468 if k == key:
2469 yield v
2470 else:
2471 for o in search_dict(v, key):
2472 yield o
2473 elif isinstance(partial, list):
2474 for i in partial:
2475 for o in search_dict(i, key):
2476 yield o
2477
2478 try:
2479 ncd = next(search_dict(yt_initial_data, 'nextContinuationData'))
2480 continuations = [ncd['continuation']]
2481 # Handle videos where comments have been disabled entirely
2482 except StopIteration:
2483 continuations = []
2484
2485 def get_continuation(continuation, session_token, replies=False):
2486 query = {
2487 'pbj': 1,
2488 'ctoken': continuation,
2489 }
2490 if replies:
2491 query['action_get_comment_replies'] = 1
2492 else:
2493 query['action_get_comments'] = 1
2494
2495 while True:
2496 content, handle = self._download_webpage_handle(
2497 'https://www.youtube.com/comment_service_ajax',
2498 video_id,
2499 note=False,
2500 expected_status=[413],
2501 data=urlencode_postdata({
2502 'session_token': session_token
2503 }),
2504 query=query,
2505 headers={
2506 'Accept': '*/*',
2507 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
2508 'X-YouTube-Client-Name': '1',
2509 'X-YouTube-Client-Version': '2.20201202.06.01'
2510 }
2511 )
2512
2513 response_code = handle.getcode()
2514 if (response_code == 200):
2515 return self._parse_json(content, video_id)
2516 if (response_code == 413):
2517 return None
2518 raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
2519
2520 first_continuation = True
2521 while continuations:
2522 continuation, itct = continuations.pop()
2523 comment_response = get_continuation(continuation, xsrf_token)
2524 if not comment_response:
2525 continue
2526 if list(search_dict(comment_response, 'externalErrorMessage')):
2527 raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
2528
2529 if 'continuationContents' not in comment_response['response']:
2530 # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
2531 continue
2532 # not sure if this actually helps
2533 if 'xsrf_token' in comment_response:
2534 xsrf_token = comment_response['xsrf_token']
2535
2536 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
2537 if first_continuation:
2538 expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
2539 first_continuation = False
2540 if 'contents' not in item_section:
2541 # continuation returned no comments?
2542 # set an empty array as to not break the for loop
2543 item_section['contents'] = []
2544
2545 for meta_comment in item_section['contents']:
2546 comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
2547 video_comments.append({
2548 'id': comment['commentId'],
2549 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
2550 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
2551 'author': comment.get('authorText', {}).get('simpleText', ''),
2552 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
2553 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
2554 'parent': 'root'
2555 })
2556 if 'replies' not in meta_comment['commentThreadRenderer']:
2557 continue
2558
2559 reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
2560 while reply_continuations:
2561 time.sleep(1)
2562 continuation = reply_continuations.pop()
2563 replies_data = get_continuation(continuation, xsrf_token, True)
2564 if not replies_data or 'continuationContents' not in replies_data[1]['response']:
2565 continue
2566
2567 if self._downloader.params.get('verbose', False):
2568 self.to_screen('[debug] Comments downloaded (chain %s) %s of ~%s' % (comment['commentId'], len(video_comments), expected_video_comment_count))
2569 reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
2570 for reply_meta in replies_data[1]['response']['continuationContents']['commentRepliesContinuation']['contents']:
2571 reply_comment = reply_meta['commentRenderer']
2572 video_comments.append({
2573 'id': reply_comment['commentId'],
2574 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
2575 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
2576 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
2577 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
2578 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
2579 'parent': comment['commentId']
2580 })
2581 if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
2582 continue
2583
2584 reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
2585
2586 self.to_screen('Comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2587
2588 if 'continuations' in item_section:
2589 continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
2590 time.sleep(1)
2591
2592 self.to_screen('Total comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2593 else:
2594 expected_video_comment_count = None
2595 video_comments = None
2596
2597 # Look for the DASH manifest
2598 if self._downloader.params.get('youtube_include_dash_manifest', True):
2599 dash_mpd_fatal = True
2600 for mpd_url in dash_mpds:
2601 dash_formats = {}
2602 try:
2603 def decrypt_sig(mobj):
2604 s = mobj.group(1)
2605 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2606 return '/signature/%s' % dec_s
2607
2608 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2609
2610 for df in self._extract_mpd_formats(
2611 mpd_url, video_id, fatal=dash_mpd_fatal,
2612 formats_dict=self._formats):
2613 if not df.get('filesize'):
2614 df['filesize'] = _extract_filesize(df['url'])
2615 # Do not overwrite DASH format found in some previous DASH manifest
2616 if df['format_id'] not in dash_formats:
2617 dash_formats[df['format_id']] = df
2618 # Additional DASH manifests may end up in HTTP Error 403 therefore
2619 # allow them to fail without bug report message if we already have
2620 # some DASH manifest succeeded. This is temporary workaround to reduce
2621 # burst of bug reports until we figure out the reason and whether it
2622 # can be fixed at all.
2623 dash_mpd_fatal = False
2624 except (ExtractorError, KeyError) as e:
2625 self.report_warning(
2626 'Skipping DASH manifest: %r' % e, video_id)
2627 if dash_formats:
2628 # Remove the formats we found through non-DASH, they
2629 # contain less info and it can be wrong, because we use
2630 # fixed values (for example the resolution). See
2631 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2632 # example.
2633 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2634 formats.extend(dash_formats.values())
2635
2636 # Check for malformed aspect ratio
2637 stretched_m = re.search(
2638 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2639 video_webpage)
2640 if stretched_m:
2641 w = float(stretched_m.group('w'))
2642 h = float(stretched_m.group('h'))
2643 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2644 # We will only process correct ratios.
2645 if w > 0 and h > 0:
2646 ratio = w / h
2647 for f in formats:
2648 if f.get('vcodec') != 'none':
2649 f['stretched_ratio'] = ratio
2650
2651 if not formats:
2652 if 'reason' in video_info:
2653 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2654 regions_allowed = self._html_search_meta(
2655 'regionsAllowed', video_webpage, default=None)
2656 countries = regions_allowed.split(',') if regions_allowed else None
2657 self.raise_geo_restricted(
2658 msg=video_info['reason'][0], countries=countries)
2659 reason = video_info['reason'][0]
2660 if 'Invalid parameters' in reason:
2661 unavailable_message = extract_unavailable_message()
2662 if unavailable_message:
2663 reason = unavailable_message
2664 raise ExtractorError(
2665 'YouTube said: %s' % reason,
2666 expected=True, video_id=video_id)
2667 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2668 raise ExtractorError('This video is DRM protected.', expected=True)
2669
2670 self._sort_formats(formats)
2671
2672 self.mark_watched(video_id, video_info, player_response)
2673
2674 return {
2675 'id': video_id,
2676 'uploader': video_uploader,
2677 'uploader_id': video_uploader_id,
2678 'uploader_url': video_uploader_url,
2679 'channel': video_uploader,
2680 'channel_id': channel_id,
2681 'channel_url': channel_url,
2682 'upload_date': upload_date,
2683 'license': video_license,
2684 'creator': video_creator or artist,
2685 'title': video_title,
2686 'alt_title': video_alt_title or track,
2687 'thumbnails': thumbnails,
2688 'description': video_description,
2689 'categories': video_categories,
2690 'tags': video_tags,
2691 'subtitles': video_subtitles,
2692 'automatic_captions': automatic_captions,
2693 'duration': video_duration,
2694 'age_limit': 18 if age_gate else 0,
2695 'annotations': video_annotations,
2696 'chapters': chapters,
2697 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2698 'view_count': view_count,
2699 'like_count': like_count,
2700 'dislike_count': dislike_count,
2701 'average_rating': average_rating,
2702 'formats': formats,
2703 'is_live': is_live,
2704 'start_time': start_time,
2705 'end_time': end_time,
2706 'series': series,
2707 'season_number': season_number,
2708 'episode_number': episode_number,
2709 'track': track,
2710 'artist': artist,
2711 'album': album,
2712 'release_date': release_date,
2713 'release_year': release_year,
2714 'subscriber_count': subscriber_count,
2715 'playable_in_embed': playable_in_embed,
2716 'comments': video_comments,
2717 'comment_count': expected_video_comment_count,
2718 }
2719
2720
2721 class YoutubeTabIE(YoutubeBaseInfoExtractor):
2722 IE_DESC = 'YouTube.com tab'
2723 _VALID_URL = r'''(?x)
2724 https?://
2725 (?:\w+\.)?
2726 (?:
2727 youtube(?:kids)?\.com|
2728 invidio\.us
2729 )/
2730 (?:
2731 (?:channel|c|user)/|
2732 (?P<not_channel>
2733 feed/|
2734 (?:playlist|watch)\?.*?\blist=
2735 )|
2736 (?!(?:%s)\b) # Direct URLs
2737 )
2738 (?P<id>[^/?\#&]+)
2739 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
2740 IE_NAME = 'youtube:tab'
2741
2742 _TESTS = [{
2743 # playlists, multipage
2744 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2745 'playlist_mincount': 94,
2746 'info_dict': {
2747 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2748 'title': 'Игорь Клейнер - Playlists',
2749 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2750 },
2751 }, {
2752 # playlists, multipage, different order
2753 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2754 'playlist_mincount': 94,
2755 'info_dict': {
2756 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2757 'title': 'Игорь Клейнер - Playlists',
2758 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2759 },
2760 }, {
2761 # playlists, singlepage
2762 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2763 'playlist_mincount': 4,
2764 'info_dict': {
2765 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2766 'title': 'ThirstForScience - Playlists',
2767 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2768 }
2769 }, {
2770 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2771 'only_matching': True,
2772 }, {
2773 # basic, single video playlist
2774 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2775 'info_dict': {
2776 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2777 'uploader': 'Sergey M.',
2778 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2779 'title': 'youtube-dl public playlist',
2780 },
2781 'playlist_count': 1,
2782 }, {
2783 # empty playlist
2784 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2785 'info_dict': {
2786 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2787 'uploader': 'Sergey M.',
2788 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2789 'title': 'youtube-dl empty playlist',
2790 },
2791 'playlist_count': 0,
2792 }, {
2793 # Home tab
2794 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2795 'info_dict': {
2796 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2797 'title': 'lex will - Home',
2798 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2799 },
2800 'playlist_mincount': 2,
2801 }, {
2802 # Videos tab
2803 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2804 'info_dict': {
2805 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2806 'title': 'lex will - Videos',
2807 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2808 },
2809 'playlist_mincount': 975,
2810 }, {
2811 # Videos tab, sorted by popular
2812 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2813 'info_dict': {
2814 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2815 'title': 'lex will - Videos',
2816 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2817 },
2818 'playlist_mincount': 199,
2819 }, {
2820 # Playlists tab
2821 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2822 'info_dict': {
2823 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2824 'title': 'lex will - Playlists',
2825 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2826 },
2827 'playlist_mincount': 17,
2828 }, {
2829 # Community tab
2830 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2831 'info_dict': {
2832 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2833 'title': 'lex will - Community',
2834 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2835 },
2836 'playlist_mincount': 18,
2837 }, {
2838 # Channels tab
2839 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2840 'info_dict': {
2841 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2842 'title': 'lex will - Channels',
2843 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2844 },
2845 'playlist_mincount': 138,
2846 }, {
2847 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2848 'only_matching': True,
2849 }, {
2850 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2851 'only_matching': True,
2852 }, {
2853 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2854 'only_matching': True,
2855 }, {
2856 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2857 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2858 'info_dict': {
2859 'title': '29C3: Not my department',
2860 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2861 'uploader': 'Christiaan008',
2862 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2863 },
2864 'playlist_count': 96,
2865 }, {
2866 'note': 'Large playlist',
2867 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2868 'info_dict': {
2869 'title': 'Uploads from Cauchemar',
2870 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2871 'uploader': 'Cauchemar',
2872 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2873 },
2874 'playlist_mincount': 1123,
2875 }, {
2876 # even larger playlist, 8832 videos
2877 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2878 'only_matching': True,
2879 }, {
2880 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2881 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2882 'info_dict': {
2883 'title': 'Uploads from Interstellar Movie',
2884 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2885 'uploader': 'Interstellar Movie',
2886 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2887 },
2888 'playlist_mincount': 21,
2889 }, {
2890 # https://github.com/ytdl-org/youtube-dl/issues/21844
2891 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2892 'info_dict': {
2893 'title': 'Data Analysis with Dr Mike Pound',
2894 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2895 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2896 'uploader': 'Computerphile',
2897 },
2898 'playlist_mincount': 11,
2899 }, {
2900 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2901 'only_matching': True,
2902 }, {
2903 # Playlist URL that does not actually serve a playlist
2904 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2905 'info_dict': {
2906 'id': 'FqZTN594JQw',
2907 'ext': 'webm',
2908 'title': "Smiley's People 01 detective, Adventure Series, Action",
2909 'uploader': 'STREEM',
2910 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2911 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2912 'upload_date': '20150526',
2913 'license': 'Standard YouTube License',
2914 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2915 'categories': ['People & Blogs'],
2916 'tags': list,
2917 'view_count': int,
2918 'like_count': int,
2919 'dislike_count': int,
2920 },
2921 'params': {
2922 'skip_download': True,
2923 },
2924 'skip': 'This video is not available.',
2925 'add_ie': [YoutubeIE.ie_key()],
2926 }, {
2927 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2928 'only_matching': True,
2929 }, {
2930 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2931 'only_matching': True,
2932 }, {
2933 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2934 'info_dict': {
2935 'id': '9Auq9mYxFEE',
2936 'ext': 'mp4',
2937 'title': 'Watch Sky News live',
2938 'uploader': 'Sky News',
2939 'uploader_id': 'skynews',
2940 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2941 'upload_date': '20191102',
2942 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2943 'categories': ['News & Politics'],
2944 'tags': list,
2945 'like_count': int,
2946 'dislike_count': int,
2947 },
2948 'params': {
2949 'skip_download': True,
2950 },
2951 }, {
2952 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2953 'info_dict': {
2954 'id': 'a48o2S1cPoo',
2955 'ext': 'mp4',
2956 'title': 'The Young Turks - Live Main Show',
2957 'uploader': 'The Young Turks',
2958 'uploader_id': 'TheYoungTurks',
2959 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2960 'upload_date': '20150715',
2961 'license': 'Standard YouTube License',
2962 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2963 'categories': ['News & Politics'],
2964 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2965 'like_count': int,
2966 'dislike_count': int,
2967 },
2968 'params': {
2969 'skip_download': True,
2970 },
2971 'only_matching': True,
2972 }, {
2973 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2974 'only_matching': True,
2975 }, {
2976 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2977 'only_matching': True,
2978 }, {
2979 'url': 'https://www.youtube.com/feed/trending',
2980 'only_matching': True,
2981 }, {
2982 # needs auth
2983 'url': 'https://www.youtube.com/feed/library',
2984 'only_matching': True,
2985 }, {
2986 # needs auth
2987 'url': 'https://www.youtube.com/feed/history',
2988 'only_matching': True,
2989 }, {
2990 # needs auth
2991 'url': 'https://www.youtube.com/feed/subscriptions',
2992 'only_matching': True,
2993 }, {
2994 # needs auth
2995 'url': 'https://www.youtube.com/feed/watch_later',
2996 'only_matching': True,
2997 }, {
2998 # no longer available?
2999 'url': 'https://www.youtube.com/feed/recommended',
3000 'only_matching': True,
3001 }, {
3002 # inline playlist with not always working continuations
3003 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3004 'only_matching': True,
3005 }, {
3006 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3007 'only_matching': True,
3008 }, {
3009 'url': 'https://www.youtube.com/course',
3010 'only_matching': True,
3011 }, {
3012 'url': 'https://www.youtube.com/zsecurity',
3013 'only_matching': True,
3014 }, {
3015 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3016 'only_matching': True,
3017 }, {
3018 'url': 'https://www.youtube.com/TheYoungTurks/live',
3019 'only_matching': True,
3020 }]
3021
3022 @classmethod
3023 def suitable(cls, url):
3024 return False if YoutubeIE.suitable(url) else super(
3025 YoutubeTabIE, cls).suitable(url)
3026
3027 def _extract_channel_id(self, webpage):
3028 channel_id = self._html_search_meta(
3029 'channelId', webpage, 'channel id', default=None)
3030 if channel_id:
3031 return channel_id
3032 channel_url = self._html_search_meta(
3033 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3034 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3035 'twitter:app:url:googleplay'), webpage, 'channel url')
3036 return self._search_regex(
3037 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3038 channel_url, 'channel id')
3039
3040 @staticmethod
3041 def _extract_grid_item_renderer(item):
3042 for item_kind in ('Playlist', 'Video', 'Channel'):
3043 renderer = item.get('grid%sRenderer' % item_kind)
3044 if renderer:
3045 return renderer
3046
3047 def _grid_entries(self, grid_renderer):
3048 for item in grid_renderer['items']:
3049 if not isinstance(item, dict):
3050 continue
3051 renderer = self._extract_grid_item_renderer(item)
3052 if not isinstance(renderer, dict):
3053 continue
3054 title = try_get(
3055 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3056 # playlist
3057 playlist_id = renderer.get('playlistId')
3058 if playlist_id:
3059 yield self.url_result(
3060 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3061 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3062 video_title=title)
3063 # video
3064 video_id = renderer.get('videoId')
3065 if video_id:
3066 yield self._extract_video(renderer)
3067 # channel
3068 channel_id = renderer.get('channelId')
3069 if channel_id:
3070 title = try_get(
3071 renderer, lambda x: x['title']['simpleText'], compat_str)
3072 yield self.url_result(
3073 'https://www.youtube.com/channel/%s' % channel_id,
3074 ie=YoutubeTabIE.ie_key(), video_title=title)
3075
3076 def _shelf_entries_from_content(self, shelf_renderer):
3077 content = shelf_renderer.get('content')
3078 if not isinstance(content, dict):
3079 return
3080 renderer = content.get('gridRenderer')
3081 if renderer:
3082 # TODO: add support for nested playlists so each shelf is processed
3083 # as separate playlist
3084 # TODO: this includes only first N items
3085 for entry in self._grid_entries(renderer):
3086 yield entry
3087 renderer = content.get('horizontalListRenderer')
3088 if renderer:
3089 # TODO
3090 pass
3091
3092 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3093 ep = try_get(
3094 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3095 compat_str)
3096 shelf_url = urljoin('https://www.youtube.com', ep)
3097 if shelf_url:
3098 # Skipping links to another channels, note that checking for
3099 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3100 # will not work
3101 if skip_channels and '/channels?' in shelf_url:
3102 return
3103 title = try_get(
3104 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3105 yield self.url_result(shelf_url, video_title=title)
3106 # Shelf may not contain shelf URL, fallback to extraction from content
3107 for entry in self._shelf_entries_from_content(shelf_renderer):
3108 yield entry
3109
3110 def _playlist_entries(self, video_list_renderer):
3111 for content in video_list_renderer['contents']:
3112 if not isinstance(content, dict):
3113 continue
3114 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3115 if not isinstance(renderer, dict):
3116 continue
3117 video_id = renderer.get('videoId')
3118 if not video_id:
3119 continue
3120 yield self._extract_video(renderer)
3121
3122 r""" # Not needed in the new implementation
3123 def _itemSection_entries(self, item_sect_renderer):
3124 for content in item_sect_renderer['contents']:
3125 if not isinstance(content, dict):
3126 continue
3127 renderer = content.get('videoRenderer', {})
3128 if not isinstance(renderer, dict):
3129 continue
3130 video_id = renderer.get('videoId')
3131 if not video_id:
3132 continue
3133 yield self._extract_video(renderer)
3134 """
3135
3136 def _rich_entries(self, rich_grid_renderer):
3137 renderer = try_get(
3138 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3139 video_id = renderer.get('videoId')
3140 if not video_id:
3141 return
3142 yield self._extract_video(renderer)
3143
3144 def _video_entry(self, video_renderer):
3145 video_id = video_renderer.get('videoId')
3146 if video_id:
3147 return self._extract_video(video_renderer)
3148
3149 def _post_thread_entries(self, post_thread_renderer):
3150 post_renderer = try_get(
3151 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3152 if not post_renderer:
3153 return
3154 # video attachment
3155 video_renderer = try_get(
3156 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
3157 video_id = None
3158 if video_renderer:
3159 entry = self._video_entry(video_renderer)
3160 if entry:
3161 yield entry
3162 # inline video links
3163 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3164 for run in runs:
3165 if not isinstance(run, dict):
3166 continue
3167 ep_url = try_get(
3168 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3169 if not ep_url:
3170 continue
3171 if not YoutubeIE.suitable(ep_url):
3172 continue
3173 ep_video_id = YoutubeIE._match_id(ep_url)
3174 if video_id == ep_video_id:
3175 continue
3176 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
3177
3178 def _post_thread_continuation_entries(self, post_thread_continuation):
3179 contents = post_thread_continuation.get('contents')
3180 if not isinstance(contents, list):
3181 return
3182 for content in contents:
3183 renderer = content.get('backstagePostThreadRenderer')
3184 if not isinstance(renderer, dict):
3185 continue
3186 for entry in self._post_thread_entries(renderer):
3187 yield entry
3188
3189 @staticmethod
3190 def _build_continuation_query(continuation, ctp=None):
3191 query = {
3192 'ctoken': continuation,
3193 'continuation': continuation,
3194 }
3195 if ctp:
3196 query['itct'] = ctp
3197 return query
3198
3199 @staticmethod
3200 def _extract_next_continuation_data(renderer):
3201 next_continuation = try_get(
3202 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
3203 if not next_continuation:
3204 return
3205 continuation = next_continuation.get('continuation')
3206 if not continuation:
3207 return
3208 ctp = next_continuation.get('clickTrackingParams')
3209 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3210
3211 @classmethod
3212 def _extract_continuation(cls, renderer):
3213 next_continuation = cls._extract_next_continuation_data(renderer)
3214 if next_continuation:
3215 return next_continuation
3216 contents = renderer.get('contents')
3217 if not isinstance(contents, list):
3218 return
3219 for content in contents:
3220 if not isinstance(content, dict):
3221 continue
3222 continuation_ep = try_get(
3223 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
3224 dict)
3225 if not continuation_ep:
3226 continue
3227 continuation = try_get(
3228 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
3229 if not continuation:
3230 continue
3231 ctp = continuation_ep.get('clickTrackingParams')
3232 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3233
    def _entries(self, tab, identity_token):
        """Generate all entries for a tab, following continuations.

        First yields everything in the tab's initial renderer, then keeps
        requesting /browse_ajax continuation pages until no further
        continuation token is produced.
        """

        # Closure over continuation_list: extract_entries both yields entries
        # and records the next continuation token as a side effect.
        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Rich grid items (e.g. home feed) live outside
                    # itemSectionRenderer; their continuation comes from the
                    # parent renderer
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    # Dispatch on renderer type; the first matching kind wins
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        # On the Channels tab, shelf links to other channels
                        # are the actual content, so do not skip them there
                        is_channels_tab = tab.get('title') == 'Channels'
                        for entry in self._shelf_entries(renderer, not is_channels_tab):
                            yield entry
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Old-style continuation payload: one of several *Continuation keys
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
                if continuation_renderer:
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # New-style continuation payload: appendContinuationItemsAction
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    # NOTE(review): this `continue` re-requests the same
                    # continuation token — looks like a potential infinite
                    # loop; confirm the server can't return this repeatedly
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
            break
3377
3378 @staticmethod
3379 def _extract_selected_tab(tabs):
3380 for tab in tabs:
3381 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3382 return tab['tabRenderer']
3383 else:
3384 raise ExtractorError('Unable to find selected tab')
3385
3386 @staticmethod
3387 def _extract_uploader(data):
3388 uploader = {}
3389 sidebar_renderer = try_get(
3390 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3391 if sidebar_renderer:
3392 for item in sidebar_renderer:
3393 if not isinstance(item, dict):
3394 continue
3395 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3396 if not isinstance(renderer, dict):
3397 continue
3398 owner = try_get(
3399 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3400 if owner:
3401 uploader['uploader'] = owner.get('text')
3402 uploader['uploader_id'] = try_get(
3403 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3404 uploader['uploader_url'] = urljoin(
3405 'https://www.youtube.com/',
3406 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3407 return {k: v for k, v in uploader.items() if v is not None}
3408
3409 def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
3410 playlist_id = title = description = channel_url = channel_name = channel_id = None
3411 thumbnails_list = tags = []
3412
3413 selected_tab = self._extract_selected_tab(tabs)
3414 renderer = try_get(
3415 data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
3416 if renderer:
3417 channel_name = renderer.get('title')
3418 channel_url = renderer.get('channelUrl')
3419 channel_id = renderer.get('externalId')
3420
3421 if not renderer:
3422 renderer = try_get(
3423 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
3424 if renderer:
3425 title = renderer.get('title')
3426 description = renderer.get('description')
3427 playlist_id = channel_id
3428 tags = renderer.get('keywords', '').split()
3429 thumbnails_list = (
3430 try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
3431 or data['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails']
3432 or [])
3433
3434 thumbnails = []
3435 for t in thumbnails_list:
3436 if not isinstance(t, dict):
3437 continue
3438 thumbnail_url = url_or_none(t.get('url'))
3439 if not thumbnail_url:
3440 continue
3441 thumbnails.append({
3442 'url': thumbnail_url,
3443 'width': int_or_none(t.get('width')),
3444 'height': int_or_none(t.get('height')),
3445 })
3446
3447 if playlist_id is None:
3448 playlist_id = item_id
3449 if title is None:
3450 title = playlist_id
3451 title += format_field(selected_tab, 'title', ' - %s')
3452
3453 metadata = {
3454 'playlist_id': playlist_id,
3455 'playlist_title': title,
3456 'playlist_description': description,
3457 'uploader': channel_name,
3458 'uploader_id': channel_id,
3459 'uploader_url': channel_url,
3460 'thumbnails': thumbnails,
3461 'tags': tags,
3462 }
3463 if not channel_id:
3464 metadata.update(self._extract_uploader(data))
3465 metadata.update({
3466 'channel': metadata['uploader'],
3467 'channel_id': metadata['uploader_id'],
3468 'channel_url': metadata['uploader_url']})
3469 return self.playlist_result(
3470 self._entries(selected_tab, identity_token),
3471 **metadata)
3472
3473 def _extract_from_playlist(self, item_id, url, data, playlist):
3474 title = playlist.get('title') or try_get(
3475 data, lambda x: x['titleText']['simpleText'], compat_str)
3476 playlist_id = playlist.get('playlistId') or item_id
3477 # Inline playlist rendition continuation does not always work
3478 # at Youtube side, so delegating regular tab-based playlist URL
3479 # processing whenever possible.
3480 playlist_url = urljoin(url, try_get(
3481 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3482 compat_str))
3483 if playlist_url and playlist_url != url:
3484 return self.url_result(
3485 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3486 video_title=title)
3487 return self.playlist_result(
3488 self._playlist_entries(playlist), playlist_id=playlist_id,
3489 playlist_title=title)
3490
3491 @staticmethod
3492 def _extract_alerts(data):
3493 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
3494 if not isinstance(alert_dict, dict):
3495 continue
3496 for renderer in alert_dict:
3497 alert = alert_dict[renderer]
3498 alert_type = alert.get('type')
3499 if not alert_type:
3500 continue
3501 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
3502 if message:
3503 yield alert_type, message
3504 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
3505 message = try_get(run, lambda x: x['text'], compat_str)
3506 if message:
3507 yield alert_type, message
3508
3509 def _extract_identity_token(self, webpage, item_id):
3510 ytcfg = self._extract_ytcfg(item_id, webpage)
3511 if ytcfg:
3512 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
3513 if token:
3514 return token
3515 return self._search_regex(
3516 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
3517 'identity token', default=None)
3518
    def _real_extract(self, url):
        """Dispatch a tab/channel/playlist/watch URL to the right handler.

        Normalizes the host to www.youtube.com, redirects bare channel
        pages to their /videos tab, honors --no-playlist for watch URLs
        carrying both v= and list=, surfaces YouTube alert messages, and
        finally extracts from tabs, an inline playlist, or a single video.
        """
        item_id = self._match_id(url)
        # Force the canonical host so later URL comparisons and requests
        # behave consistently.
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # Matches when nothing but an optional '/' (plus query/fragment)
        # follows the matched prefix — i.e. a channel/user home page.
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        # watch-style URL without a v= parameter: fall back to its playlist
        # when one is present, otherwise give up.
        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
            if playlist_id:
                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
            else:
                raise ExtractorError('Unable to recognize tab page')
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        # Warn about every non-error alert; remember the last error message
        # (warning about any earlier one) and raise it after the loop.
        err_msg = None
        for alert_type, alert_message in self._extract_alerts(data):
            if alert_type.lower() == 'error':
                if err_msg:
                    self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
                err_msg = alert_message
            else:
                self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        if err_msg:
            raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3578
3579
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to YoutubeTabIE for anything it can already handle.
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        # Rebuild a canonical /playlist URL (carrying over any query
        # parameters, or just list=<id> for bare playlist IDs) and hand
        # it off to the tab extractor.
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query) or {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', query),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3654
3655
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Rewrite the short youtu.be form into a full watch URL so the tab
        # extractor can apply its video-vs-playlist logic.
        mobj = re.match(self._VALID_URL, url)
        video_id, playlist_id = mobj.group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3694
3695
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Expand the "ytuser:NAME" shorthand to a canonical user URL and
        # delegate to the tab extractor.
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3709
3710
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos are served as the auto-generated "LL" playlist.
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3728
3729
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional extra search parameter string POSTed as 'params' (set by
    # subclasses, e.g. to change result ordering)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video results for query via the innertube search API,
        following continuation tokens across pages."""
        # Request payload for the youtubei/v1/search endpoint; the
        # 'continuation' key is added below when paginating.
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page and continuation pages nest the section list under
            # different paths; try both.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    # Skip non-video items (ads, shelves, channels, ...)
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

                # Keep the first continuation token encountered; later
                # sections may not carry one.
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3810
3811
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # 'CAI%3D' is 'CAI=' URL-encoded — presumably the innertube filter
    # selecting upload-date ordering (newest first, per IE_DESC); sent as
    # the 'params' field of the search request by the parent class.
    _SEARCH_PARAMS = 'CAI%3D'
3817
3818
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run a search for the query embedded in a /results URL."""
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query = (params.get('search_query') or params.get('q'))[0]
        # Forward any 'sp' filter parameter to the search API.
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3844
3845
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the authenticated per-account feed extractors.
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authenticate up front.
        self._login()

    def _real_extract(self, url):
        # Every feed is just a tab page under /feed/<name>; delegate to
        # the tab extractor.
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
3866
3867
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The watch-later list is served as the auto-generated "WL" playlist.
        playlist_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(playlist_url, ie=YoutubeTabIE.ie_key())
3880
3881
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches a bare youtube.com homepage URL (nothing after the
    # optional '/', query or fragment only)
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    # Resolved by the base class to https://www.youtube.com/feed/recommended
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
3896
3897
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    # Accepts :ytsub, :ytsubs, :ytsubscription and :ytsubscriptions
    _VALID_URL = r':ytsub(?:scription)?s?'
    # Resolved by the base class to https://www.youtube.com/feed/subscriptions
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
3909
3910
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    # Resolved by the base class to https://www.youtube.com/feed/history
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
3919
3920
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Such URLs are watch links whose v= parameter was lost (typically
        # to an unquoted '&' in the shell), so fail with actionable advice.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
3968
3969
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A video ID shorter than 11 characters can never be valid; report
        # the truncation instead of attempting extraction.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
3985
3986
# Do Youtube show urls even exist anymore? I couldn't find any.
# NOTE: extractor intentionally disabled — the class below is kept inside a
# raw-string literal (never executed) for reference, in case show URLs
# resurface.
r'''
class YoutubeShowIE(YoutubeTabIE):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
'''