]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/youtube.py
f9e60f03ef42b430b78f4d9eb70e968900690939
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 ExtractorError,
34 float_or_none,
35 get_element_by_id,
36 int_or_none,
37 mimetype2ext,
38 parse_codecs,
39 parse_count,
40 parse_duration,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_or_none,
45 str_to_int,
46 try_get,
47 unescapeHTML,
48 unified_strdate,
49 unsmuggle_url,
50 update_url_query,
51 uppercase_escape,
52 url_or_none,
53 urlencode_postdata,
54 urljoin,
55 )
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoints driven by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the 'TL' token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # youtube.com URL path components that can never be a channel/user name
    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID shapes: prefixed base64-ish IDs plus the special
    # RDMM/WL/LL/LM auto-generated lists
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _set_language(self):
        """Set the PREF cookie so YouTube serves English-language pages."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Convert a list of video IDs into url_result() dicts for YoutubeIE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the Google sign-in RPC flow; the response body
            # carries a junk prefix before the JSON array, which
            # transform_source strips off before parsing
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account by username.
        # NOTE(review): the positional null/constant layout below mirrors the
        # reverse-engineered wire format; the meaning of most slots is unknown,
        # so the ordering must not be changed.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token at [0][2]; identifies the account in all
        # subsequent requests of this flow
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password for the looked-up account
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry at [0][5] signals that the password step failed
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # If present, an additional challenge (e.g. two-step verification)
        # must be solved before the session is granted
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # 'TL' token required to address the TFA endpoint (_TFA_URL)
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Strip the optional 'G-' prefix users may copy from the app
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-reporting shape as the password step above
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved automatically; the user has
                # to resolve them in a browser
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Step 3: follow the CheckCookie URL to finalize the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through/links to myaccount.google.com
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Wrap InfoExtractor._download_webpage_handle.

        Copies the 'query' dict first so the caller's mapping is never
        mutated by the superclass.
        """
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON blob embedded in a page.

        Returns the parsed dict, or None if the blob is absent or unparsable
        (both the regex search and the JSON parse are non-fatal).
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt login before any extraction."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Minimal innertube client context sent with every _call_api() request
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # Regexes for the JSON blobs YouTube inlines into watch/browse pages
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Marks the end of an inlined JSON blob, used to anchor the non-greedy
    # _YT_INITIAL_DATA_RE match
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id):
        """POST to the youtubei (innertube) API endpoint 'ep'.

        'query' is merged over _DEFAULT_API_DATA to form the JSON request
        body; returns the parsed JSON response.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        # NOTE(review): the 'key' value looks like the hardcoded web-client
        # API key rather than a per-user secret -- confirm before rotating
        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract ytInitialData like _get_yt_initial_data, but fatally.

        First tries the boundary-anchored regex, then the bare one; raises
        (via _search_regex without a default) if neither matches.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Parse the ytcfg.set({...}) config object from a page.

        Returns an empty dict when the pattern is missing and None when the
        JSON cannot be parsed (fatal=False).
        """
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a url_transparent result dict from a video renderer object.

        'renderer' is a videoRenderer-style dict as found in ytInitialData;
        every field is read defensively via try_get, so missing keys simply
        yield None in the result.
        """
        video_id = renderer.get('videoId')
        # Title may be either a runs-list or a simpleText string
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Strip all whitespace first so grouped digits ("1 234 567") parse
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
364
365
366 class YoutubeIE(YoutubeBaseInfoExtractor):
367 IE_DESC = 'YouTube.com'
368 _VALID_URL = r"""(?x)^
369 (
370 (?:https?://|//) # http(s):// or protocol-independent URL
371 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
372 (?:www\.)?deturl\.com/www\.youtube\.com/|
373 (?:www\.)?pwnyoutube\.com/|
374 (?:www\.)?hooktube\.com/|
375 (?:www\.)?yourepeat\.com/|
376 tube\.majestyc\.net/|
377 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
378 (?:(?:www|dev)\.)?invidio\.us/|
379 (?:(?:www|no)\.)?invidiou\.sh/|
380 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
381 (?:www\.)?invidious\.kabi\.tk/|
382 (?:www\.)?invidious\.13ad\.de/|
383 (?:www\.)?invidious\.mastodon\.host/|
384 (?:www\.)?invidious\.zapashcanon\.fr/|
385 (?:www\.)?invidious\.kavin\.rocks/|
386 (?:www\.)?invidious\.tube/|
387 (?:www\.)?invidiou\.site/|
388 (?:www\.)?invidious\.site/|
389 (?:www\.)?invidious\.xyz/|
390 (?:www\.)?invidious\.nixnet\.xyz/|
391 (?:www\.)?invidious\.drycat\.fr/|
392 (?:www\.)?tube\.poal\.co/|
393 (?:www\.)?tube\.connect\.cafe/|
394 (?:www\.)?vid\.wxzm\.sx/|
395 (?:www\.)?vid\.mint\.lgbt/|
396 (?:www\.)?yewtu\.be/|
397 (?:www\.)?yt\.elukerio\.org/|
398 (?:www\.)?yt\.lelux\.fi/|
399 (?:www\.)?invidious\.ggc-project\.de/|
400 (?:www\.)?yt\.maisputain\.ovh/|
401 (?:www\.)?invidious\.13ad\.de/|
402 (?:www\.)?invidious\.toot\.koeln/|
403 (?:www\.)?invidious\.fdn\.fr/|
404 (?:www\.)?watch\.nettohikari\.com/|
405 (?:www\.)?kgg2m7yk5aybusll\.onion/|
406 (?:www\.)?qklhadlycap4cnod\.onion/|
407 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
408 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
409 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
410 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
411 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
412 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
413 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
414 (?:.*?\#/)? # handle anchor (#/) redirect urls
415 (?: # the various things that can precede the ID:
416 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
417 |(?: # or the v= param in all its forms
418 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
419 (?:\?|\#!?) # the params delimiter ? or # or #!
420 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
421 v=
422 )
423 ))
424 |(?:
425 youtu\.be| # just youtu.be/xxxx
426 vid\.plus| # or vid.plus/xxxx
427 zwearz\.com/watch| # or zwearz.com/watch/xxxx
428 )/
429 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
430 )
431 )? # all until now is optional -> you can pass the naked ID
432 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
433 (?!.*?\blist=
434 (?:
435 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
436 WL # WL are handled by the watch later IE
437 )
438 )
439 (?(1).+)? # if we found the ID, everything can follow
440 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
441 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
442 _PLAYER_INFO_RE = (
443 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
444 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
445 )
446 _formats = {
447 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
448 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
449 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
450 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
451 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
452 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
453 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
454 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
455 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
456 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
457 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
458 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
459 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
460 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
461 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
462 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
463 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
464 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
465
466
467 # 3D videos
468 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
469 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
470 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
471 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
472 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
473 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
474 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
475
476 # Apple HTTP Live Streaming
477 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
478 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
479 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
480 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
481 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
482 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
483 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
484 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
485
486 # DASH mp4 video
487 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
488 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
489 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
490 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
491 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
493 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
497 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
498 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
499
500 # Dash mp4 audio
501 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
502 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
503 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
504 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
505 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
506 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
507 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
508
509 # Dash webm
510 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
511 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
512 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
513 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
514 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
517 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
518 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
519 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
520 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
521 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
526 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
528 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
529 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
530 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532
533 # Dash webm audio
534 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
535 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
536
537 # Dash webm audio with opus inside
538 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
539 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
540 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
541
542 # RTMP (unnamed)
543 '_rtmp': {'protocol': 'rtmp'},
544
545 # av01 video only formats sometimes served with "unknown" codecs
546 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
547 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
548 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
549 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
550 }
551 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
552
553 _GEO_BYPASS = False
554
555 IE_NAME = 'youtube'
556 _TESTS = [
557 {
558 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
559 'info_dict': {
560 'id': 'BaW_jenozKc',
561 'ext': 'mp4',
562 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
563 'uploader': 'Philipp Hagemeister',
564 'uploader_id': 'phihag',
565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
566 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
567 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
568 'upload_date': '20121002',
569 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
570 'categories': ['Science & Technology'],
571 'tags': ['youtube-dl'],
572 'duration': 10,
573 'view_count': int,
574 'like_count': int,
575 'dislike_count': int,
576 'start_time': 1,
577 'end_time': 9,
578 }
579 },
580 {
581 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
582 'note': 'Embed-only video (#1746)',
583 'info_dict': {
584 'id': 'yZIXLfi8CZQ',
585 'ext': 'mp4',
586 'upload_date': '20120608',
587 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
588 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
589 'uploader': 'SET India',
590 'uploader_id': 'setindia',
591 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
592 'age_limit': 18,
593 }
594 },
595 {
596 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
597 'note': 'Use the first video ID in the URL',
598 'info_dict': {
599 'id': 'BaW_jenozKc',
600 'ext': 'mp4',
601 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
602 'uploader': 'Philipp Hagemeister',
603 'uploader_id': 'phihag',
604 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
605 'upload_date': '20121002',
606 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
607 'categories': ['Science & Technology'],
608 'tags': ['youtube-dl'],
609 'duration': 10,
610 'view_count': int,
611 'like_count': int,
612 'dislike_count': int,
613 },
614 'params': {
615 'skip_download': True,
616 },
617 },
618 {
619 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
620 'note': '256k DASH audio (format 141) via DASH manifest',
621 'info_dict': {
622 'id': 'a9LDPn-MO4I',
623 'ext': 'm4a',
624 'upload_date': '20121002',
625 'uploader_id': '8KVIDEO',
626 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
627 'description': '',
628 'uploader': '8KVIDEO',
629 'title': 'UHDTV TEST 8K VIDEO.mp4'
630 },
631 'params': {
632 'youtube_include_dash_manifest': True,
633 'format': '141',
634 },
635 'skip': 'format 141 not served anymore',
636 },
637 # DASH manifest with encrypted signature
638 {
639 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
640 'info_dict': {
641 'id': 'IB3lcPjvWLA',
642 'ext': 'm4a',
643 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
644 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
645 'duration': 244,
646 'uploader': 'AfrojackVEVO',
647 'uploader_id': 'AfrojackVEVO',
648 'upload_date': '20131011',
649 },
650 'params': {
651 'youtube_include_dash_manifest': True,
652 'format': '141/bestaudio[ext=m4a]',
653 },
654 },
655 # Controversy video
656 {
657 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
658 'info_dict': {
659 'id': 'T4XJQO3qol8',
660 'ext': 'mp4',
661 'duration': 219,
662 'upload_date': '20100909',
663 'uploader': 'Amazing Atheist',
664 'uploader_id': 'TheAmazingAtheist',
665 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
666 'title': 'Burning Everyone\'s Koran',
667 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
668 }
669 },
670 # Normal age-gate video (embed allowed)
671 {
672 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
673 'info_dict': {
674 'id': 'HtVdAasjOgU',
675 'ext': 'mp4',
676 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
677 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
678 'duration': 142,
679 'uploader': 'The Witcher',
680 'uploader_id': 'WitcherGame',
681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
682 'upload_date': '20140605',
683 'age_limit': 18,
684 },
685 },
686 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
687 # YouTube Red ad is not captured for creator
688 {
689 'url': '__2ABJjxzNo',
690 'info_dict': {
691 'id': '__2ABJjxzNo',
692 'ext': 'mp4',
693 'duration': 266,
694 'upload_date': '20100430',
695 'uploader_id': 'deadmau5',
696 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
697 'creator': 'Dada Life, deadmau5',
698 'description': 'md5:12c56784b8032162bb936a5f76d55360',
699 'uploader': 'deadmau5',
700 'title': 'Deadmau5 - Some Chords (HD)',
701 'alt_title': 'This Machine Kills Some Chords',
702 },
703 'expected_warnings': [
704 'DASH manifest missing',
705 ]
706 },
707 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
708 {
709 'url': 'lqQg6PlCWgI',
710 'info_dict': {
711 'id': 'lqQg6PlCWgI',
712 'ext': 'mp4',
713 'duration': 6085,
714 'upload_date': '20150827',
715 'uploader_id': 'olympic',
716 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
717 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
718 'uploader': 'Olympic',
719 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
720 },
721 'params': {
722 'skip_download': 'requires avconv',
723 }
724 },
725 # Non-square pixels
726 {
727 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
728 'info_dict': {
729 'id': '_b-2C3KPAM0',
730 'ext': 'mp4',
731 'stretched_ratio': 16 / 9.,
732 'duration': 85,
733 'upload_date': '20110310',
734 'uploader_id': 'AllenMeow',
735 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
736 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
737 'uploader': '孫ᄋᄅ',
738 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
739 },
740 },
741 # url_encoded_fmt_stream_map is empty string
742 {
743 'url': 'qEJwOuvDf7I',
744 'info_dict': {
745 'id': 'qEJwOuvDf7I',
746 'ext': 'webm',
747 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
748 'description': '',
749 'upload_date': '20150404',
750 'uploader_id': 'spbelect',
751 'uploader': 'Наблюдатели Петербурга',
752 },
753 'params': {
754 'skip_download': 'requires avconv',
755 },
756 'skip': 'This live event has ended.',
757 },
758 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
759 {
760 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
761 'info_dict': {
762 'id': 'FIl7x6_3R5Y',
763 'ext': 'webm',
764 'title': 'md5:7b81415841e02ecd4313668cde88737a',
765 'description': 'md5:116377fd2963b81ec4ce64b542173306',
766 'duration': 220,
767 'upload_date': '20150625',
768 'uploader_id': 'dorappi2000',
769 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
770 'uploader': 'dorappi2000',
771 'formats': 'mincount:31',
772 },
773 'skip': 'not actual anymore',
774 },
775 # DASH manifest with segment_list
776 {
777 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
778 'md5': '8ce563a1d667b599d21064e982ab9e31',
779 'info_dict': {
780 'id': 'CsmdDsKjzN8',
781 'ext': 'mp4',
782 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
783 'uploader': 'Airtek',
784 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
785 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
786 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
787 },
788 'params': {
789 'youtube_include_dash_manifest': True,
790 'format': '135', # bestvideo
791 },
792 'skip': 'This live event has ended.',
793 },
794 {
795 # Multifeed videos (multiple cameras), URL is for Main Camera
796 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
797 'info_dict': {
798 'id': 'jqWvoWXjCVs',
799 'title': 'teamPGP: Rocket League Noob Stream',
800 'description': 'md5:dc7872fb300e143831327f1bae3af010',
801 },
802 'playlist': [{
803 'info_dict': {
804 'id': 'jqWvoWXjCVs',
805 'ext': 'mp4',
806 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
807 'description': 'md5:dc7872fb300e143831327f1bae3af010',
808 'duration': 7335,
809 'upload_date': '20150721',
810 'uploader': 'Beer Games Beer',
811 'uploader_id': 'beergamesbeer',
812 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
813 'license': 'Standard YouTube License',
814 },
815 }, {
816 'info_dict': {
817 'id': '6h8e8xoXJzg',
818 'ext': 'mp4',
819 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
820 'description': 'md5:dc7872fb300e143831327f1bae3af010',
821 'duration': 7337,
822 'upload_date': '20150721',
823 'uploader': 'Beer Games Beer',
824 'uploader_id': 'beergamesbeer',
825 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
826 'license': 'Standard YouTube License',
827 },
828 }, {
829 'info_dict': {
830 'id': 'PUOgX5z9xZw',
831 'ext': 'mp4',
832 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
833 'description': 'md5:dc7872fb300e143831327f1bae3af010',
834 'duration': 7337,
835 'upload_date': '20150721',
836 'uploader': 'Beer Games Beer',
837 'uploader_id': 'beergamesbeer',
838 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
839 'license': 'Standard YouTube License',
840 },
841 }, {
842 'info_dict': {
843 'id': 'teuwxikvS5k',
844 'ext': 'mp4',
845 'title': 'teamPGP: Rocket League Noob Stream (zim)',
846 'description': 'md5:dc7872fb300e143831327f1bae3af010',
847 'duration': 7334,
848 'upload_date': '20150721',
849 'uploader': 'Beer Games Beer',
850 'uploader_id': 'beergamesbeer',
851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
852 'license': 'Standard YouTube License',
853 },
854 }],
855 'params': {
856 'skip_download': True,
857 },
858 'skip': 'This video is not available.',
859 },
860 {
861 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
862 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
863 'info_dict': {
864 'id': 'gVfLd0zydlo',
865 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
866 },
867 'playlist_count': 2,
868 'skip': 'Not multifeed anymore',
869 },
870 {
871 'url': 'https://vid.plus/FlRa-iH7PGw',
872 'only_matching': True,
873 },
874 {
875 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
876 'only_matching': True,
877 },
878 {
879 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
880 # Also tests cut-off URL expansion in video description (see
881 # https://github.com/ytdl-org/youtube-dl/issues/1892,
882 # https://github.com/ytdl-org/youtube-dl/issues/8164)
883 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
884 'info_dict': {
885 'id': 'lsguqyKfVQg',
886 'ext': 'mp4',
887 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
888 'alt_title': 'Dark Walk - Position Music',
889 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
890 'duration': 133,
891 'upload_date': '20151119',
892 'uploader_id': 'IronSoulElf',
893 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
894 'uploader': 'IronSoulElf',
895 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
896 'track': 'Dark Walk - Position Music',
897 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
898 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
899 },
900 'params': {
901 'skip_download': True,
902 },
903 },
904 {
905 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
906 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
907 'only_matching': True,
908 },
909 {
910 # Video with yt:stretch=17:0
911 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
912 'info_dict': {
913 'id': 'Q39EVAstoRM',
914 'ext': 'mp4',
915 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
916 'description': 'md5:ee18a25c350637c8faff806845bddee9',
917 'upload_date': '20151107',
918 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
919 'uploader': 'CH GAMER DROID',
920 },
921 'params': {
922 'skip_download': True,
923 },
924 'skip': 'This video does not exist.',
925 },
926 {
927 # Video licensed under Creative Commons
928 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
929 'info_dict': {
930 'id': 'M4gD1WSo5mA',
931 'ext': 'mp4',
932 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
933 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
934 'duration': 721,
935 'upload_date': '20150127',
936 'uploader_id': 'BerkmanCenter',
937 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
938 'uploader': 'The Berkman Klein Center for Internet & Society',
939 'license': 'Creative Commons Attribution license (reuse allowed)',
940 },
941 'params': {
942 'skip_download': True,
943 },
944 },
945 {
946 # Channel-like uploader_url
947 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
948 'info_dict': {
949 'id': 'eQcmzGIKrzg',
950 'ext': 'mp4',
951 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
952 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
953 'duration': 4060,
954 'upload_date': '20151119',
955 'uploader': 'Bernie Sanders',
956 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
957 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
958 'license': 'Creative Commons Attribution license (reuse allowed)',
959 },
960 'params': {
961 'skip_download': True,
962 },
963 },
964 {
965 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
966 'only_matching': True,
967 },
968 {
969 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
970 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
971 'only_matching': True,
972 },
973 {
974 # Rental video preview
975 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
976 'info_dict': {
977 'id': 'uGpuVWrhIzE',
978 'ext': 'mp4',
979 'title': 'Piku - Trailer',
980 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
981 'upload_date': '20150811',
982 'uploader': 'FlixMatrix',
983 'uploader_id': 'FlixMatrixKaravan',
984 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
985 'license': 'Standard YouTube License',
986 },
987 'params': {
988 'skip_download': True,
989 },
990 'skip': 'This video is not available.',
991 },
992 {
993 # YouTube Red video with episode data
994 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
995 'info_dict': {
996 'id': 'iqKdEhx-dD4',
997 'ext': 'mp4',
998 'title': 'Isolation - Mind Field (Ep 1)',
999 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1000 'duration': 2085,
1001 'upload_date': '20170118',
1002 'uploader': 'Vsauce',
1003 'uploader_id': 'Vsauce',
1004 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1005 'series': 'Mind Field',
1006 'season_number': 1,
1007 'episode_number': 1,
1008 },
1009 'params': {
1010 'skip_download': True,
1011 },
1012 'expected_warnings': [
1013 'Skipping DASH manifest',
1014 ],
1015 },
1016 {
1017 # The following content has been identified by the YouTube community
1018 # as inappropriate or offensive to some audiences.
1019 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1020 'info_dict': {
1021 'id': '6SJNVb0GnPI',
1022 'ext': 'mp4',
1023 'title': 'Race Differences in Intelligence',
1024 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1025 'duration': 965,
1026 'upload_date': '20140124',
1027 'uploader': 'New Century Foundation',
1028 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1029 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1030 },
1031 'params': {
1032 'skip_download': True,
1033 },
1034 },
1035 {
1036 # itag 212
1037 'url': '1t24XAntNCY',
1038 'only_matching': True,
1039 },
1040 {
1041 # geo restricted to JP
1042 'url': 'sJL6WA-aGkQ',
1043 'only_matching': True,
1044 },
1045 {
1046 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1047 'only_matching': True,
1048 },
1049 {
1050 # DRM protected
1051 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1052 'only_matching': True,
1053 },
1054 {
1055 # Video with unsupported adaptive stream type formats
1056 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1057 'info_dict': {
1058 'id': 'Z4Vy8R84T1U',
1059 'ext': 'mp4',
1060 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1061 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1062 'duration': 433,
1063 'upload_date': '20130923',
1064 'uploader': 'Amelia Putri Harwita',
1065 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1066 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1067 'formats': 'maxcount:10',
1068 },
1069 'params': {
1070 'skip_download': True,
1071 'youtube_include_dash_manifest': False,
1072 },
1073 'skip': 'not actual anymore',
1074 },
1075 {
1076 # Youtube Music Auto-generated description
1077 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1078 'info_dict': {
1079 'id': 'MgNrAu2pzNs',
1080 'ext': 'mp4',
1081 'title': 'Voyeur Girl',
1082 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1083 'upload_date': '20190312',
1084 'uploader': 'Stephen - Topic',
1085 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1086 'artist': 'Stephen',
1087 'track': 'Voyeur Girl',
1088 'album': 'it\'s too much love to know my dear',
1089 'release_date': '20190313',
1090 'release_year': 2019,
1091 },
1092 'params': {
1093 'skip_download': True,
1094 },
1095 },
1096 {
1097 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1098 'only_matching': True,
1099 },
1100 {
1101 # invalid -> valid video id redirection
1102 'url': 'DJztXj2GPfl',
1103 'info_dict': {
1104 'id': 'DJztXj2GPfk',
1105 'ext': 'mp4',
1106 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1107 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1108 'upload_date': '20090125',
1109 'uploader': 'Prochorowka',
1110 'uploader_id': 'Prochorowka',
1111 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1112 'artist': 'Panjabi MC',
1113 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1114 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1115 },
1116 'params': {
1117 'skip_download': True,
1118 },
1119 },
1120 {
1121 # empty description results in an empty string
1122 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1123 'info_dict': {
1124 'id': 'x41yOUIvK2k',
1125 'ext': 'mp4',
1126 'title': 'IMG 3456',
1127 'description': '',
1128 'upload_date': '20170613',
1129 'uploader_id': 'ElevageOrVert',
1130 'uploader': 'ElevageOrVert',
1131 },
1132 'params': {
1133 'skip_download': True,
1134 },
1135 },
1136 {
1137 # with '};' inside yt initial data (see [1])
1138 # see [2] for an example with '};' inside ytInitialPlayerResponse
1139 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1140 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1141 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1142 'info_dict': {
1143 'id': 'CHqg6qOn4no',
1144 'ext': 'mp4',
1145 'title': 'Part 77 Sort a list of simple types in c#',
1146 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1147 'upload_date': '20130831',
1148 'uploader_id': 'kudvenkat',
1149 'uploader': 'kudvenkat',
1150 },
1151 'params': {
1152 'skip_download': True,
1153 },
1154 },
1155 {
1156 # another example of '};' in ytInitialData
1157 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1158 'only_matching': True,
1159 },
1160 {
1161 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1162 'only_matching': True,
1163 },
1164 ]
1165
1166 def __init__(self, *args, **kwargs):
1167 super(YoutubeIE, self).__init__(*args, **kwargs)
1168 self._player_cache = {}
1169
1170 def report_video_info_webpage_download(self, video_id):
1171 """Report attempt to download video info webpage."""
1172 self.to_screen('%s: Downloading video info webpage' % video_id)
1173
1174 def report_information_extraction(self, video_id):
1175 """Report attempt to extract video information."""
1176 self.to_screen('%s: Extracting video information' % video_id)
1177
1178 def report_unavailable_format(self, video_id, format):
1179 """Report extracted video URL."""
1180 self.to_screen('%s: Format %s not available' % (video_id, format))
1181
1182 def report_rtmp_download(self):
1183 """Indicate the download will use the RTMP protocol."""
1184 self.to_screen('RTMP download detected')
1185
1186 def _signature_cache_id(self, example_sig):
1187 """ Return a string representation of a signature """
1188 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1189
1190 @classmethod
1191 def _extract_player_info(cls, player_url):
1192 for player_re in cls._PLAYER_INFO_RE:
1193 id_m = re.search(player_re, player_url)
1194 if id_m:
1195 break
1196 else:
1197 raise ExtractorError('Cannot identify player %r' % player_url)
1198 return id_m.group('ext'), id_m.group('id')
1199
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (and cache) the signature-decryption function for player_url.

        Returns a callable mapping an encrypted signature string to a working
        one.  The result is cached on disk keyed by player type/id and the
        layout of example_sig, so subsequent runs skip the player download.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a cache file name; it must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of source-character indices
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive a cacheable index mapping by running the function on a probe
        # string that has the same length as example_sig
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1239
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function func.

        Runs func on a probe string shaped like example_sig and renders the
        resulting character-index mapping as compact slice expressions
        (debug aid for the youtube_print_sig_code option).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # NOTE(review): assumes len(idxs) >= 2, otherwise 'i' below is
            # unbound — signatures always have several characters in practice
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or flush it as a slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new ascending/descending run
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index
                    yield 's[%d]' % prev
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1278
    def _parse_sig_js(self, jscode):
        """Locate the signature-decipher function in the player JS code and
        return a callable that wraps it via JSInterpreter.

        The patterns are ordered from current to obsolete player layouts.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single (list-wrapped) argument
        return lambda s: initial_function([s])
1299
1300 def _parse_sig_swf(self, file_contents):
1301 swfi = SWFInterpreter(file_contents)
1302 TARGET_CLASSNAME = 'SignatureDecipher'
1303 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1304 initial_function = swfi.extract_function(searched_class, 'decipher')
1305 return lambda s: initial_function([s])
1306
1307 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1308 """Turn the encrypted s field into a working signature"""
1309
1310 if player_url is None:
1311 raise ExtractorError('Cannot decrypt signature without player_url')
1312
1313 if player_url.startswith('//'):
1314 player_url = 'https:' + player_url
1315 elif not re.match(r'https?://', player_url):
1316 player_url = compat_urlparse.urljoin(
1317 'https://www.youtube.com', player_url)
1318 try:
1319 player_id = (player_url, self._signature_cache_id(s))
1320 if player_id not in self._player_cache:
1321 func = self._extract_signature_function(
1322 video_id, player_url, s
1323 )
1324 self._player_cache[player_id] = func
1325 func = self._player_cache[player_id]
1326 if self._downloader.params.get('youtube_print_sig_code'):
1327 self._print_sig_code(func, s)
1328 return func(s)
1329 except Exception as e:
1330 tb = traceback.format_exc()
1331 raise ExtractorError(
1332 'Signature extraction failed: ' + tb, cause=e)
1333
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Return a dict mapping subtitle language codes to format lists.

        Queries the legacy timedtext track list for video_id; when
        has_live_chat_replay is true, also adds a 'live_chat' pseudo-subtitle
        handled by the youtube_live_chat_replay protocol downloader.
        Returns {} when no subtitles can be found or the request fails.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # The first track listed for a language wins
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1373
1374 def _get_ytplayer_config(self, video_id, webpage):
1375 patterns = (
1376 # User data may contain arbitrary character sequences that may affect
1377 # JSON extraction with regex, e.g. when '};' is contained the second
1378 # regex won't capture the whole JSON. Yet working around by trying more
1379 # concrete regex first keeping in mind proper quoted string handling
1380 # to be implemented in future that will replace this workaround (see
1381 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1382 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1383 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1384 r';ytplayer\.config\s*=\s*({.+?});',
1385 )
1386 config = self._search_regex(
1387 patterns, webpage, 'ytplayer.config', default=None)
1388 if config:
1389 return self._parse_json(
1390 uppercase_escape(config), video_id, fatal=False)
1391
    def _get_automatic_captions(self, video_id, player_response, player_config):
        """Return automatically-generated (translated) captions as a dict
        mapping language codes to format lists.

        player_response and player_config come from the already-downloaded
        watch page, so no extra page fetch is needed here.  Returns {} when
        no automatic captions are available.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not (player_response or player_config):
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config.get('args') if player_config else {}
            caption_url = args.get('ttsurl')
            # Oldest variant: a ttsurl plus timestamp in the player args
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the query
                # string of the base caption URL
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1488
1489 def _mark_watched(self, video_id, video_info, player_response):
1490 playback_url = url_or_none(try_get(
1491 player_response,
1492 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1493 video_info, lambda x: x['videostats_playback_base_url'][0]))
1494 if not playback_url:
1495 return
1496 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1497 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1498
1499 # cpn generation algorithm is reverse engineered from base.js.
1500 # In fact it works even with dummy cpn.
1501 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1502 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1503
1504 qs.update({
1505 'ver': ['2'],
1506 'cpn': [cpn],
1507 })
1508 playback_url = compat_urlparse.urlunparse(
1509 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1510
1511 self._download_webpage(
1512 playback_url, video_id, 'Marking watched',
1513 'Unable to mark watched', fatal=False)
1514
    @staticmethod
    def _extract_urls(webpage):
        """Return YouTube references embedded in webpage.

        Covers <iframe>/<embed>/<object>/SWFObject embeds (yielding full
        URLs) as well as lazyYT placeholders and the Wordpress "YouTube
        Video Importer" plugin (both yielding bare video ids).
        """
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1546
1547 @staticmethod
1548 def _extract_url(webpage):
1549 urls = YoutubeIE._extract_urls(webpage)
1550 return urls[0] if urls else None
1551
1552 @classmethod
1553 def extract_id(cls, url):
1554 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1555 if mobj is None:
1556 raise ExtractorError('Invalid URL: %s' % url)
1557 video_id = mobj.group(2)
1558 return video_id
1559
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapter markers from the chaptered player bar in the
        page's ytInitialData.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when the page carries no usable chapter data.
        """
        if not webpage:
            return
        data = self._extract_yt_initial_data(video_id, webpage)
        if not data or not isinstance(data, dict):
            return
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # timeRangeStartMillis is in milliseconds; convert to seconds
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one starts; the last one ends at
            # the full video duration
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1604
    @staticmethod
    def _extract_chapters_from_description(description, duration):
        """Parse chapter markers out of seekTo anchor links in the video
        description HTML.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when the description contains no timestamp links.
        """
        if not description:
            return None
        chapter_lines = re.findall(
            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
            description)
        if not chapter_lines:
            return None
        chapters = []
        for next_num, (chapter_line, time_point) in enumerate(
                chapter_lines, start=1):
            start_time = parse_duration(time_point)
            if start_time is None:
                continue
            # Timestamps are expected in ascending order; a start beyond the
            # video duration ends the scan
            if start_time > duration:
                break
            # A chapter ends where the next one starts; the last one ends at
            # the full video duration
            end_time = (duration if next_num == len(chapter_lines)
                        else parse_duration(chapter_lines[next_num][1]))
            if end_time is None:
                continue
            if end_time > duration:
                end_time = duration
            if start_time > end_time:
                break
            # Strip the seekTo anchor markup and surrounding dashes/whitespace
            chapter_title = re.sub(
                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
            chapter_title = re.sub(r'\s+', ' ', chapter_title)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': chapter_title,
            })
        return chapters
1639
1640 def _extract_chapters(self, webpage, description, video_id, duration):
1641 return (self._extract_chapters_from_json(webpage, video_id, duration)
1642 or self._extract_chapters_from_description(description, duration))
1643
1644 def _real_extract(self, url):
1645 url, smuggled_data = unsmuggle_url(url, {})
1646
1647 proto = (
1648 'http' if self._downloader.params.get('prefer_insecure', False)
1649 else 'https')
1650
1651 start_time = None
1652 end_time = None
1653 parsed_url = compat_urllib_parse_urlparse(url)
1654 for component in [parsed_url.fragment, parsed_url.query]:
1655 query = compat_parse_qs(component)
1656 if start_time is None and 't' in query:
1657 start_time = parse_duration(query['t'][0])
1658 if start_time is None and 'start' in query:
1659 start_time = parse_duration(query['start'][0])
1660 if end_time is None and 'end' in query:
1661 end_time = parse_duration(query['end'][0])
1662
1663 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1664 mobj = re.search(self._NEXT_URL_RE, url)
1665 if mobj:
1666 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1667 video_id = self.extract_id(url)
1668
1669 # Get video webpage
1670 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1671 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1672
1673 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1674 video_id = qs.get('v', [None])[0] or video_id
1675
1676 # Attempt to extract SWF player URL
1677 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1678 if mobj is not None:
1679 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1680 else:
1681 player_url = None
1682
1683 dash_mpds = []
1684
        def add_dash_mpd(video_info):
            # Queue the legacy 'dashmpd' manifest URL from a
            # compat_parse_qs-style video_info dict (values are lists);
            # appends to the enclosing dash_mpds list, skipping duplicates.
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd and dash_mpd[0] not in dash_mpds:
                dash_mpds.append(dash_mpd[0])
1689
        def add_dash_mpd_pr(pl_response):
            # Queue the DASH manifest URL advertised by a player_response
            # dict (streamingData.dashManifestUrl); appends to the enclosing
            # dash_mpds list, skipping duplicates and non-URL values.
            dash_mpd = url_or_none(try_get(
                pl_response, lambda x: x['streamingData']['dashManifestUrl'],
                compat_str))
            if dash_mpd and dash_mpd not in dash_mpds:
                dash_mpds.append(dash_mpd)
1696
1697 is_live = None
1698 view_count = None
1699
1700 def extract_view_count(v_info):
1701 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1702
        def extract_player_response(player_response, video_id):
            # Parse a player_response JSON string into a dict.
            # Returns None for empty input or unparseable/non-dict JSON.
            # Side effect: registers any advertised DASH manifest URL via
            # add_dash_mpd_pr().
            pl_response = str_or_none(player_response)
            if not pl_response:
                return
            pl_response = self._parse_json(pl_response, video_id, fatal=False)
            if isinstance(pl_response, dict):
                add_dash_mpd_pr(pl_response)
                return pl_response
1711
        def extract_embedded_config(embed_webpage, video_id):
            # Grab the raw JSON argument of the setConfig({...}) call found
            # in the embed page markup; returns None when the pattern is
            # absent (the string is not parsed here).
            embedded_config = self._search_regex(
                r'setConfig\(({.*})\);',
                embed_webpage, 'ytInitialData', default=None)
            if embedded_config:
                return embedded_config
1718
1719 video_info = {}
1720 player_response = {}
1721 ytplayer_config = None
1722 embed_webpage = None
1723
1724 # Get video info
1725 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1726 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1727 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1728 age_gate = True
1729 # We simulate the access to the video from www.youtube.com/v/{video_id}
1730 # this can be viewed without login into Youtube
1731 url = proto + '://www.youtube.com/embed/%s' % video_id
1732 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1733 ext = extract_embedded_config(embed_webpage, video_id)
1734 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1735 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1736 if not playable_in_embed:
1737 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1738 playable_in_embed = ''
1739 else:
1740 playable_in_embed = playable_in_embed.group('playableinEmbed')
1741 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1742 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1743 if playable_in_embed == 'false':
1744 '''
1745 # TODO apply this patch when Support for Python 2.6(!) and above drops
1746 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1747 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1748 '''
1749 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1750 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1751 age_gate = False
1752 # Try looking directly into the video webpage
1753 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1754 if ytplayer_config:
1755 args = ytplayer_config.get("args")
1756 if args is not None:
1757 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1758 # Convert to the same format returned by compat_parse_qs
1759 video_info = dict((k, [v]) for k, v in args.items())
1760 add_dash_mpd(video_info)
1761 # Rental video is not rented but preview is available (e.g.
1762 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1763 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1764 if not video_info and args.get('ypc_vid'):
1765 return self.url_result(
1766 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1767 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1768 is_live = True
1769 if not player_response:
1770 player_response = extract_player_response(args.get('player_response'), video_id)
1771 elif not player_response:
1772 player_response = ytplayer_config
1773 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1774 add_dash_mpd_pr(player_response)
1775 else:
1776 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1777 else:
1778 data = compat_urllib_parse_urlencode({
1779 'video_id': video_id,
1780 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1781 'sts': self._search_regex(
1782 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1783 })
1784 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1785 try:
1786 video_info_webpage = self._download_webpage(
1787 video_info_url, video_id,
1788 note='Refetching age-gated info webpage',
1789 errnote='unable to download video info webpage')
1790 except ExtractorError:
1791 video_info_webpage = None
1792 if video_info_webpage:
1793 video_info = compat_parse_qs(video_info_webpage)
1794 pl_response = video_info.get('player_response', [None])[0]
1795 player_response = extract_player_response(pl_response, video_id)
1796 add_dash_mpd(video_info)
1797 view_count = extract_view_count(video_info)
1798 else:
1799 age_gate = False
1800 # Try looking directly into the video webpage
1801 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1802 if ytplayer_config:
1803 args = ytplayer_config.get('args', {})
1804 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1805 # Convert to the same format returned by compat_parse_qs
1806 video_info = dict((k, [v]) for k, v in args.items())
1807 add_dash_mpd(video_info)
1808 # Rental video is not rented but preview is available (e.g.
1809 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1810 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1811 if not video_info and args.get('ypc_vid'):
1812 return self.url_result(
1813 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1814 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1815 is_live = True
1816 if not player_response:
1817 player_response = extract_player_response(args.get('player_response'), video_id)
1818 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1819 add_dash_mpd_pr(player_response)
1820
1821 if not video_info and not player_response:
1822 player_response = extract_player_response(
1823 self._search_regex(
1824 (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
1825 self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
1826 'initial player response', default='{}'),
1827 video_id)
1828
        def extract_unavailable_message():
            # Collect the "unavailable" heading (<h1 id="unavailable-message">)
            # and sub-message (<div id="unavailable-submessage">) from the
            # watch page; returns them newline-joined, or None when neither
            # is present.
            messages = []
            for tag, kind in (('h1', 'message'), ('div', 'submessage')):
                msg = self._html_search_regex(
                    r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
                    video_webpage, 'unavailable %s' % kind, default=None)
                if msg:
                    messages.append(msg)
            if messages:
                return '\n'.join(messages)
1839
1840 if not video_info and not player_response:
1841 unavailable_message = extract_unavailable_message()
1842 if not unavailable_message:
1843 unavailable_message = 'Unable to extract video data'
1844 raise ExtractorError(
1845 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1846
1847 if not isinstance(video_info, dict):
1848 video_info = {}
1849
1850 playable_in_embed = try_get(
1851 player_response, lambda x: x['playabilityStatus']['playableInEmbed'])
1852
1853 video_details = try_get(
1854 player_response, lambda x: x['videoDetails'], dict) or {}
1855
1856 microformat = try_get(
1857 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1858
1859 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1860 if not video_title:
1861 self._downloader.report_warning('Unable to extract video title')
1862 video_title = '_'
1863
1864 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1865 if video_description:
1866
            def replace_url(m):
                # re.sub callback for links in the description HTML: resolve
                # the matched href/title against the watch page URL; if it is
                # a youtube.com/redirect wrapper, unwrap it to its 'q' query
                # parameter, otherwise return the absolute URL as-is.
                redir_url = compat_urlparse.urljoin(url, m.group(1))
                parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
                if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
                    qs = compat_parse_qs(parsed_redir_url.query)
                    q = qs.get('q')
                    if q and q[0]:
                        return q[0]
                return redir_url
1876
1877 description_original = video_description = re.sub(r'''(?x)
1878 <a\s+
1879 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1880 (?:title|href)="([^"]+)"\s+
1881 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1882 class="[^"]*"[^>]*>
1883 [^<]+\.{3}\s*
1884 </a>
1885 ''', replace_url, video_description)
1886 video_description = clean_html(video_description)
1887 else:
1888 video_description = video_details.get('shortDescription')
1889 if video_description is None:
1890 video_description = self._html_search_meta('description', video_webpage)
1891
1892 if not smuggled_data.get('force_singlefeed', False):
1893 if not self._downloader.params.get('noplaylist'):
1894 multifeed_metadata_list = try_get(
1895 player_response,
1896 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1897 compat_str) or try_get(
1898 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1899 if multifeed_metadata_list:
1900 entries = []
1901 feed_ids = []
1902 for feed in multifeed_metadata_list.split(','):
1903 # Unquote should take place before split on comma (,) since textual
1904 # fields may contain comma as well (see
1905 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1906 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1907
                        def feed_entry(name):
                            # First value for *name* in this feed's parsed query data
                            return try_get(feed_data, lambda x: x[name][0], compat_str)
1910
1911 feed_id = feed_entry('id')
1912 if not feed_id:
1913 continue
1914 feed_title = feed_entry('title')
1915 title = video_title
1916 if feed_title:
1917 title += ' (%s)' % feed_title
1918 entries.append({
1919 '_type': 'url_transparent',
1920 'ie_key': 'Youtube',
1921 'url': smuggle_url(
1922 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1923 {'force_singlefeed': True}),
1924 'title': title,
1925 })
1926 feed_ids.append(feed_id)
1927 self.to_screen(
1928 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1929 % (', '.join(feed_ids), video_id))
1930 return self.playlist_result(entries, video_id, video_title, video_description)
1931 else:
1932 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1933
1934 if view_count is None:
1935 view_count = extract_view_count(video_info)
1936 if view_count is None and video_details:
1937 view_count = int_or_none(video_details.get('viewCount'))
1938 if view_count is None and microformat:
1939 view_count = int_or_none(microformat.get('viewCount'))
1940
1941 if is_live is None:
1942 is_live = bool_or_none(video_details.get('isLive'))
1943
1944 has_live_chat_replay = False
1945 if not is_live:
1946 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1947 try:
1948 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1949 has_live_chat_replay = True
1950 except (KeyError, IndexError, TypeError):
1951 pass
1952
1953 # Check for "rental" videos
1954 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1955 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1956
        def _extract_filesize(media_url):
            # Filesize is often embedded in the media URL itself as
            # 'clen=<bytes>' or '/clen/<bytes>'; None when absent.
            return int_or_none(self._search_regex(
                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1960
1961 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1962 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1963
1964 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1965 self.report_rtmp_download()
1966 formats = [{
1967 'format_id': '_rtmp',
1968 'protocol': 'rtmp',
1969 'url': video_info['conn'][0],
1970 'player_url': player_url,
1971 }]
1972 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1973 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1974 if 'rtmpe%3Dyes' in encoded_url_map:
1975 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1976 formats = []
1977 formats_spec = {}
1978 fmt_list = video_info.get('fmt_list', [''])[0]
1979 if fmt_list:
1980 for fmt in fmt_list.split(','):
1981 spec = fmt.split('/')
1982 if len(spec) > 1:
1983 width_height = spec[1].split('x')
1984 if len(width_height) == 2:
1985 formats_spec[spec[0]] = {
1986 'resolution': spec[1],
1987 'width': int_or_none(width_height[0]),
1988 'height': int_or_none(width_height[1]),
1989 }
1990 for fmt in streaming_formats:
1991 itag = str_or_none(fmt.get('itag'))
1992 if not itag:
1993 continue
1994 quality = fmt.get('quality')
1995 quality_label = fmt.get('qualityLabel') or quality
1996 formats_spec[itag] = {
1997 'asr': int_or_none(fmt.get('audioSampleRate')),
1998 'filesize': int_or_none(fmt.get('contentLength')),
1999 'format_note': quality_label,
2000 'fps': int_or_none(fmt.get('fps')),
2001 'height': int_or_none(fmt.get('height')),
2002 # bitrate for itag 43 is always 2147483647
2003 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2004 'width': int_or_none(fmt.get('width')),
2005 }
2006
2007 for fmt in streaming_formats:
2008 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2009 continue
2010 url = url_or_none(fmt.get('url'))
2011
2012 if not url:
2013 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2014 if not cipher:
2015 continue
2016 url_data = compat_parse_qs(cipher)
2017 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2018 if not url:
2019 continue
2020 else:
2021 cipher = None
2022 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2023
2024 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2025 # Unsupported FORMAT_STREAM_TYPE_OTF
2026 if stream_type == 3:
2027 continue
2028
2029 format_id = fmt.get('itag') or url_data['itag'][0]
2030 if not format_id:
2031 continue
2032 format_id = compat_str(format_id)
2033
2034 if cipher:
2035 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2036 ASSETS_RE = (
2037 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2038 r'"jsUrl"\s*:\s*("[^"]+")',
2039 r'"assets":.+?"js":\s*("[^"]+")')
2040 jsplayer_url_json = self._search_regex(
2041 ASSETS_RE,
2042 embed_webpage if age_gate else video_webpage,
2043 'JS player URL (1)', default=None)
2044 if not jsplayer_url_json and not age_gate:
2045 # We need the embed website after all
2046 if embed_webpage is None:
2047 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2048 embed_webpage = self._download_webpage(
2049 embed_url, video_id, 'Downloading embed webpage')
2050 jsplayer_url_json = self._search_regex(
2051 ASSETS_RE, embed_webpage, 'JS player URL')
2052
2053 player_url = json.loads(jsplayer_url_json)
2054 if player_url is None:
2055 player_url_json = self._search_regex(
2056 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2057 video_webpage, 'age gate player URL')
2058 player_url = json.loads(player_url_json)
2059
2060 if 'sig' in url_data:
2061 url += '&signature=' + url_data['sig'][0]
2062 elif 's' in url_data:
2063 encrypted_sig = url_data['s'][0]
2064
2065 if self._downloader.params.get('verbose'):
2066 if player_url is None:
2067 player_desc = 'unknown'
2068 else:
2069 player_type, player_version = self._extract_player_info(player_url)
2070 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2071 parts_sizes = self._signature_cache_id(encrypted_sig)
2072 self.to_screen('{%s} signature length %s, %s' %
2073 (format_id, parts_sizes, player_desc))
2074
2075 signature = self._decrypt_signature(
2076 encrypted_sig, video_id, player_url, age_gate)
2077 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2078 url += '&%s=%s' % (sp, signature)
2079 if 'ratebypass' not in url:
2080 url += '&ratebypass=yes'
2081
2082 dct = {
2083 'format_id': format_id,
2084 'url': url,
2085 'player_url': player_url,
2086 }
2087 if format_id in self._formats:
2088 dct.update(self._formats[format_id])
2089 if format_id in formats_spec:
2090 dct.update(formats_spec[format_id])
2091
2092 # Some itags are not included in DASH manifest thus corresponding formats will
2093 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2094 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2095 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2096 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2097
2098 if width is None:
2099 width = int_or_none(fmt.get('width'))
2100 if height is None:
2101 height = int_or_none(fmt.get('height'))
2102
2103 filesize = int_or_none(url_data.get(
2104 'clen', [None])[0]) or _extract_filesize(url)
2105
2106 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2107 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2108
2109 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2110 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2111 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2112
2113 more_fields = {
2114 'filesize': filesize,
2115 'tbr': tbr,
2116 'width': width,
2117 'height': height,
2118 'fps': fps,
2119 'format_note': quality_label or quality,
2120 }
2121 for key, value in more_fields.items():
2122 if value:
2123 dct[key] = value
2124 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2125 if type_:
2126 type_split = type_.split(';')
2127 kind_ext = type_split[0].split('/')
2128 if len(kind_ext) == 2:
2129 kind, _ = kind_ext
2130 dct['ext'] = mimetype2ext(type_split[0])
2131 if kind in ('audio', 'video'):
2132 codecs = None
2133 for mobj in re.finditer(
2134 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2135 if mobj.group('key') == 'codecs':
2136 codecs = mobj.group('val')
2137 break
2138 if codecs:
2139 dct.update(parse_codecs(codecs))
2140 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2141 dct['downloader_options'] = {
2142 # Youtube throttles chunks >~10M
2143 'http_chunk_size': 10485760,
2144 }
2145 formats.append(dct)
2146 else:
2147 manifest_url = (
2148 url_or_none(try_get(
2149 player_response,
2150 lambda x: x['streamingData']['hlsManifestUrl'],
2151 compat_str))
2152 or url_or_none(try_get(
2153 video_info, lambda x: x['hlsvp'][0], compat_str)))
2154 if manifest_url:
2155 formats = []
2156 m3u8_formats = self._extract_m3u8_formats(
2157 manifest_url, video_id, 'mp4', fatal=False)
2158 for a_format in m3u8_formats:
2159 itag = self._search_regex(
2160 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2161 if itag:
2162 a_format['format_id'] = itag
2163 if itag in self._formats:
2164 dct = self._formats[itag].copy()
2165 dct.update(a_format)
2166 a_format = dct
2167 a_format['player_url'] = player_url
2168 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2169 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2170 if self._downloader.params.get('youtube_include_hls_manifest', True):
2171 formats.append(a_format)
2172 else:
2173 error_message = extract_unavailable_message()
2174 if not error_message:
2175 reason_list = try_get(
2176 player_response,
2177 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2178 list) or []
2179 for reason in reason_list:
2180 if not isinstance(reason, dict):
2181 continue
2182 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2183 if reason_text:
2184 if not error_message:
2185 error_message = ''
2186 error_message += reason_text
2187 if error_message:
2188 error_message = clean_html(error_message)
2189 if not error_message:
2190 error_message = clean_html(try_get(
2191 player_response, lambda x: x['playabilityStatus']['reason'],
2192 compat_str))
2193 if not error_message:
2194 error_message = clean_html(
2195 try_get(video_info, lambda x: x['reason'][0], compat_str))
2196 if error_message:
2197 raise ExtractorError(error_message, expected=True)
2198 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2199
2200 # uploader
2201 video_uploader = try_get(
2202 video_info, lambda x: x['author'][0],
2203 compat_str) or str_or_none(video_details.get('author'))
2204 if video_uploader:
2205 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2206 else:
2207 self._downloader.report_warning('unable to extract uploader name')
2208
2209 # uploader_id
2210 video_uploader_id = None
2211 video_uploader_url = None
2212 mobj = re.search(
2213 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2214 video_webpage)
2215 if mobj is not None:
2216 video_uploader_id = mobj.group('uploader_id')
2217 video_uploader_url = mobj.group('uploader_url')
2218 else:
2219 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2220 if owner_profile_url:
2221 video_uploader_id = self._search_regex(
2222 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2223 default=None)
2224 video_uploader_url = owner_profile_url
2225
2226 channel_id = (
2227 str_or_none(video_details.get('channelId'))
2228 or self._html_search_meta(
2229 'channelId', video_webpage, 'channel id', default=None)
2230 or self._search_regex(
2231 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2232 video_webpage, 'channel id', default=None, group='id'))
2233 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2234
2235 thumbnails = []
2236 thumbnails_list = try_get(
2237 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2238 for t in thumbnails_list:
2239 if not isinstance(t, dict):
2240 continue
2241 thumbnail_url = url_or_none(t.get('url'))
2242 if not thumbnail_url:
2243 continue
2244 thumbnails.append({
2245 'url': thumbnail_url,
2246 'width': int_or_none(t.get('width')),
2247 'height': int_or_none(t.get('height')),
2248 })
2249
2250 if not thumbnails:
2251 video_thumbnail = None
2252 # We try first to get a high quality image:
2253 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2254 video_webpage, re.DOTALL)
2255 if m_thumb is not None:
2256 video_thumbnail = m_thumb.group(1)
2257 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2258 if thumbnail_url:
2259 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2260 if video_thumbnail:
2261 thumbnails.append({'url': video_thumbnail})
2262
2263 # upload date
2264 upload_date = self._html_search_meta(
2265 'datePublished', video_webpage, 'upload date', default=None)
2266 if not upload_date:
2267 upload_date = self._search_regex(
2268 [r'(?s)id="eow-date.*?>(.*?)</span>',
2269 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2270 video_webpage, 'upload date', default=None)
2271 if not upload_date:
2272 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2273 upload_date = unified_strdate(upload_date)
2274
2275 video_license = self._html_search_regex(
2276 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2277 video_webpage, 'license', default=None)
2278
2279 m_music = re.search(
2280 r'''(?x)
2281 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2282 <ul[^>]*>\s*
2283 <li>(?P<title>.+?)
2284 by (?P<creator>.+?)
2285 (?:
2286 \(.+?\)|
2287 <a[^>]*
2288 (?:
2289 \bhref=["\']/red[^>]*>| # drop possible
2290 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2291 )
2292 .*?
2293 )?</li
2294 ''',
2295 video_webpage)
2296 if m_music:
2297 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2298 video_creator = clean_html(m_music.group('creator'))
2299 else:
2300 video_alt_title = video_creator = None
2301
        def extract_meta(field):
            # Value of a titled metadata row (e.g. 'Song', 'Artist', 'Album')
            # in the classic watch page markup; None when the row is absent.
            return self._html_search_regex(
                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
                video_webpage, field, default=None)
2306
2307 track = extract_meta('Song')
2308 artist = extract_meta('Artist')
2309 album = extract_meta('Album')
2310
2311 # Youtube Music Auto-generated description
2312 release_date = release_year = None
2313 if video_description:
2314 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2315 if mobj:
2316 if not track:
2317 track = mobj.group('track').strip()
2318 if not artist:
2319 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2320 if not album:
2321 album = mobj.group('album'.strip())
2322 release_year = mobj.group('release_year')
2323 release_date = mobj.group('release_date')
2324 if release_date:
2325 release_date = release_date.replace('-', '')
2326 if not release_year:
2327 release_year = int(release_date[:4])
2328 if release_year:
2329 release_year = int(release_year)
2330
2331 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2332 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2333 for content in contents:
2334 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2335 multiple_songs = False
2336 for row in rows:
2337 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2338 multiple_songs = True
2339 break
2340 for row in rows:
2341 mrr = row.get('metadataRowRenderer') or {}
2342 mrr_title = try_get(
2343 mrr, lambda x: x['title']['simpleText'], compat_str)
2344 mrr_contents = try_get(
2345 mrr, lambda x: x['contents'][0], dict) or {}
2346 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2347 if not (mrr_title and mrr_contents_text):
2348 continue
2349 if mrr_title == 'License':
2350 video_license = mrr_contents_text
2351 elif not multiple_songs:
2352 if mrr_title == 'Album':
2353 album = mrr_contents_text
2354 elif mrr_title == 'Artist':
2355 artist = mrr_contents_text
2356 elif mrr_title == 'Song':
2357 track = mrr_contents_text
2358
2359 m_episode = re.search(
2360 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2361 video_webpage)
2362 if m_episode:
2363 series = unescapeHTML(m_episode.group('series'))
2364 season_number = int(m_episode.group('season'))
2365 episode_number = int(m_episode.group('episode'))
2366 else:
2367 series = season_number = episode_number = None
2368
2369 m_cat_container = self._search_regex(
2370 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2371 video_webpage, 'categories', default=None)
2372 category = None
2373 if m_cat_container:
2374 category = self._html_search_regex(
2375 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2376 default=None)
2377 if not category:
2378 category = try_get(
2379 microformat, lambda x: x['category'], compat_str)
2380 video_categories = None if category is None else [category]
2381
2382 video_tags = [
2383 unescapeHTML(m.group('content'))
2384 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2385 if not video_tags:
2386 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2387
        def _extract_count(count_name):
            # Like/dislike count from either the old button markup or the
            # newer accessibility-label JSON; parsed to int via str_to_int
            # (handles thousands separators), None when not found.
            return str_to_int(self._search_regex(
                (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
                 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
                video_webpage, count_name, default=None))
2393
2394 like_count = _extract_count('like')
2395 dislike_count = _extract_count('dislike')
2396
2397 if view_count is None:
2398 view_count = str_to_int(self._search_regex(
2399 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2400 'view count', default=None))
2401
2402 average_rating = (
2403 float_or_none(video_details.get('averageRating'))
2404 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2405
2406 # subtitles
2407 video_subtitles = self.extract_subtitles(
2408 video_id, video_webpage, has_live_chat_replay)
2409 automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
2410
2411 video_duration = try_get(
2412 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2413 if not video_duration:
2414 video_duration = int_or_none(video_details.get('lengthSeconds'))
2415 if not video_duration:
2416 video_duration = parse_duration(self._html_search_meta(
2417 'duration', video_webpage, 'video duration'))
2418
2419 # Get Subscriber Count of channel
2420 subscriber_count = parse_count(self._search_regex(
2421 r'"text":"([\d\.]+\w?) subscribers"',
2422 video_webpage,
2423 'subscriber count',
2424 default=None
2425 ))
2426
2427 # get xsrf for annotations or comments
2428 get_annotations = self._downloader.params.get('writeannotations', False)
2429 get_comments = self._downloader.params.get('getcomments', False)
2430 if get_annotations or get_comments:
2431 xsrf_token = None
2432 ytcfg = self._extract_ytcfg(video_id, video_webpage)
2433 if ytcfg:
2434 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2435 if not xsrf_token:
2436 xsrf_token = self._search_regex(
2437 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2438 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2439
2440 # annotations
2441 video_annotations = None
2442 if get_annotations:
2443 invideo_url = try_get(
2444 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2445 if xsrf_token and invideo_url:
2446 xsrf_field_name = None
2447 if ytcfg:
2448 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2449 if not xsrf_field_name:
2450 xsrf_field_name = self._search_regex(
2451 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2452 video_webpage, 'xsrf field name',
2453 group='xsrf_field_name', default='session_token')
2454 video_annotations = self._download_webpage(
2455 self._proto_relative_url(invideo_url),
2456 video_id, note='Downloading annotations',
2457 errnote='Unable to download video annotations', fatal=False,
2458 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2459
2460 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2461
2462 # Get comments
2463 # TODO: Refactor and move to seperate function
2464 if get_comments:
2465 expected_video_comment_count = 0
2466 video_comments = []
2467
2468 def find_value(html, key, num_chars=2, separator='"'):
2469 pos_begin = html.find(key) + len(key) + num_chars
2470 pos_end = html.find(separator, pos_begin)
2471 return html[pos_begin: pos_end]
2472
2473 def search_dict(partial, key):
2474 if isinstance(partial, dict):
2475 for k, v in partial.items():
2476 if k == key:
2477 yield v
2478 else:
2479 for o in search_dict(v, key):
2480 yield o
2481 elif isinstance(partial, list):
2482 for i in partial:
2483 for o in search_dict(i, key):
2484 yield o
2485
2486 try:
2487 ncd = next(search_dict(yt_initial_data, 'nextContinuationData'))
2488 continuations = [ncd['continuation']]
2489 # Handle videos where comments have been disabled entirely
2490 except StopIteration:
2491 continuations = []
2492
2493 def get_continuation(continuation, session_token, replies=False):
2494 query = {
2495 'pbj': 1,
2496 'ctoken': continuation,
2497 }
2498 if replies:
2499 query['action_get_comment_replies'] = 1
2500 else:
2501 query['action_get_comments'] = 1
2502
2503 while True:
2504 content, handle = self._download_webpage_handle(
2505 'https://www.youtube.com/comment_service_ajax',
2506 video_id,
2507 note=False,
2508 expected_status=[413],
2509 data=urlencode_postdata({
2510 'session_token': session_token
2511 }),
2512 query=query,
2513 headers={
2514 'Accept': '*/*',
2515 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
2516 'X-YouTube-Client-Name': '1',
2517 'X-YouTube-Client-Version': '2.20201202.06.01'
2518 }
2519 )
2520
2521 response_code = handle.getcode()
2522 if (response_code == 200):
2523 return self._parse_json(content, video_id)
2524 if (response_code == 413):
2525 return None
2526 raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
2527
2528 first_continuation = True
2529 while continuations:
2530 continuation, itct = continuations.pop()
2531 comment_response = get_continuation(continuation, xsrf_token)
2532 if not comment_response:
2533 continue
2534 if list(search_dict(comment_response, 'externalErrorMessage')):
2535 raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
2536
2537 if 'continuationContents' not in comment_response['response']:
2538 # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
2539 continue
2540 # not sure if this actually helps
2541 if 'xsrf_token' in comment_response:
2542 xsrf_token = comment_response['xsrf_token']
2543
2544 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
2545 if first_continuation:
2546 expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
2547 first_continuation = False
2548 if 'contents' not in item_section:
2549 # continuation returned no comments?
2550 # set an empty array as to not break the for loop
2551 item_section['contents'] = []
2552
2553 for meta_comment in item_section['contents']:
2554 comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
2555 video_comments.append({
2556 'id': comment['commentId'],
2557 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
2558 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
2559 'author': comment.get('authorText', {}).get('simpleText', ''),
2560 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
2561 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
2562 'parent': 'root'
2563 })
2564 if 'replies' not in meta_comment['commentThreadRenderer']:
2565 continue
2566
2567 reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
2568 while reply_continuations:
2569 time.sleep(1)
2570 continuation = reply_continuations.pop()
2571 replies_data = get_continuation(continuation, xsrf_token, True)
2572 if not replies_data or 'continuationContents' not in replies_data[1]['response']:
2573 continue
2574
2575 if self._downloader.params.get('verbose', False):
2576 self.to_screen('[debug] Comments downloaded (chain %s) %s of ~%s' % (comment['commentId'], len(video_comments), expected_video_comment_count))
2577 reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
2578 for reply_meta in replies_data[1]['response']['continuationContents']['commentRepliesContinuation']['contents']:
2579 reply_comment = reply_meta['commentRenderer']
2580 video_comments.append({
2581 'id': reply_comment['commentId'],
2582 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
2583 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
2584 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
2585 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
2586 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
2587 'parent': comment['commentId']
2588 })
2589 if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
2590 continue
2591
2592 reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
2593
2594 self.to_screen('Comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2595
2596 if 'continuations' in item_section:
2597 continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
2598 time.sleep(1)
2599
2600 self.to_screen('Total comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2601 else:
2602 expected_video_comment_count = None
2603 video_comments = None
2604
2605 # Look for the DASH manifest
2606 if self._downloader.params.get('youtube_include_dash_manifest', True):
2607 dash_mpd_fatal = True
2608 for mpd_url in dash_mpds:
2609 dash_formats = {}
2610 try:
2611 def decrypt_sig(mobj):
2612 s = mobj.group(1)
2613 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2614 return '/signature/%s' % dec_s
2615
2616 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2617
2618 for df in self._extract_mpd_formats(
2619 mpd_url, video_id, fatal=dash_mpd_fatal,
2620 formats_dict=self._formats):
2621 if not df.get('filesize'):
2622 df['filesize'] = _extract_filesize(df['url'])
2623 # Do not overwrite DASH format found in some previous DASH manifest
2624 if df['format_id'] not in dash_formats:
2625 dash_formats[df['format_id']] = df
2626 # Additional DASH manifests may end up in HTTP Error 403 therefore
2627 # allow them to fail without bug report message if we already have
2628 # some DASH manifest succeeded. This is temporary workaround to reduce
2629 # burst of bug reports until we figure out the reason and whether it
2630 # can be fixed at all.
2631 dash_mpd_fatal = False
2632 except (ExtractorError, KeyError) as e:
2633 self.report_warning(
2634 'Skipping DASH manifest: %r' % e, video_id)
2635 if dash_formats:
2636 # Remove the formats we found through non-DASH, they
2637 # contain less info and it can be wrong, because we use
2638 # fixed values (for example the resolution). See
2639 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2640 # example.
2641 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2642 formats.extend(dash_formats.values())
2643
2644 # Check for malformed aspect ratio
2645 stretched_m = re.search(
2646 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2647 video_webpage)
2648 if stretched_m:
2649 w = float(stretched_m.group('w'))
2650 h = float(stretched_m.group('h'))
2651 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2652 # We will only process correct ratios.
2653 if w > 0 and h > 0:
2654 ratio = w / h
2655 for f in formats:
2656 if f.get('vcodec') != 'none':
2657 f['stretched_ratio'] = ratio
2658
2659 if not formats:
2660 if 'reason' in video_info:
2661 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2662 regions_allowed = self._html_search_meta(
2663 'regionsAllowed', video_webpage, default=None)
2664 countries = regions_allowed.split(',') if regions_allowed else None
2665 self.raise_geo_restricted(
2666 msg=video_info['reason'][0], countries=countries)
2667 reason = video_info['reason'][0]
2668 if 'Invalid parameters' in reason:
2669 unavailable_message = extract_unavailable_message()
2670 if unavailable_message:
2671 reason = unavailable_message
2672 raise ExtractorError(
2673 'YouTube said: %s' % reason,
2674 expected=True, video_id=video_id)
2675 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2676 raise ExtractorError('This video is DRM protected.', expected=True)
2677
2678 self._sort_formats(formats)
2679
2680 self.mark_watched(video_id, video_info, player_response)
2681
2682 return {
2683 'id': video_id,
2684 'uploader': video_uploader,
2685 'uploader_id': video_uploader_id,
2686 'uploader_url': video_uploader_url,
2687 'channel_id': channel_id,
2688 'channel_url': channel_url,
2689 'upload_date': upload_date,
2690 'license': video_license,
2691 'creator': video_creator or artist,
2692 'title': video_title,
2693 'alt_title': video_alt_title or track,
2694 'thumbnails': thumbnails,
2695 'description': video_description,
2696 'categories': video_categories,
2697 'tags': video_tags,
2698 'subtitles': video_subtitles,
2699 'automatic_captions': automatic_captions,
2700 'duration': video_duration,
2701 'age_limit': 18 if age_gate else 0,
2702 'annotations': video_annotations,
2703 'chapters': chapters,
2704 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2705 'view_count': view_count,
2706 'like_count': like_count,
2707 'dislike_count': dislike_count,
2708 'average_rating': average_rating,
2709 'formats': formats,
2710 'is_live': is_live,
2711 'start_time': start_time,
2712 'end_time': end_time,
2713 'series': series,
2714 'season_number': season_number,
2715 'episode_number': episode_number,
2716 'track': track,
2717 'artist': artist,
2718 'album': album,
2719 'release_date': release_date,
2720 'release_year': release_year,
2721 'subscriber_count': subscriber_count,
2722 'playable_in_embed': playable_in_embed,
2723 'comments': video_comments,
2724 'comment_count': expected_video_comment_count,
2725 }
2726
2727
class YoutubeTabIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube "tab" pages: channels, users, playlists and feeds."""
    IE_DESC = 'YouTube.com tab'
    # Matches channel/c/user pages, playlist and watch-with-list URLs, and
    # feed pages.  The negative lookahead over _RESERVED_NAMES keeps known
    # site paths (e.g. /embed, /results) from being treated as the "direct
    # URL" form of a custom channel name.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:\w+\.)?
                        (?:
                            youtube(?:kids)?\.com|
                            invidio\.us
                        )/
                        (?:
                            (?:channel|c|user)/|
                            (?P<not_channel>
                                feed/|
                                (?:playlist|watch)\?.*?\blist=
                            )|
                            (?!(?:%s)\b)  # Direct URLs
                        )
                        (?P<id>[^/?\#&]+)
                    ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
    IE_NAME = 'youtube:tab'
2748
2749 _TESTS = [{
2750 # playlists, multipage
2751 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2752 'playlist_mincount': 94,
2753 'info_dict': {
2754 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2755 'title': 'Игорь Клейнер - Playlists',
2756 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2757 },
2758 }, {
2759 # playlists, multipage, different order
2760 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2761 'playlist_mincount': 94,
2762 'info_dict': {
2763 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2764 'title': 'Игорь Клейнер - Playlists',
2765 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2766 },
2767 }, {
2768 # playlists, singlepage
2769 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2770 'playlist_mincount': 4,
2771 'info_dict': {
2772 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2773 'title': 'ThirstForScience - Playlists',
2774 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2775 }
2776 }, {
2777 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2778 'only_matching': True,
2779 }, {
2780 # basic, single video playlist
2781 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2782 'info_dict': {
2783 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2784 'uploader': 'Sergey M.',
2785 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2786 'title': 'youtube-dl public playlist',
2787 },
2788 'playlist_count': 1,
2789 }, {
2790 # empty playlist
2791 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2792 'info_dict': {
2793 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2794 'uploader': 'Sergey M.',
2795 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2796 'title': 'youtube-dl empty playlist',
2797 },
2798 'playlist_count': 0,
2799 }, {
2800 # Home tab
2801 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2802 'info_dict': {
2803 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2804 'title': 'lex will - Home',
2805 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2806 },
2807 'playlist_mincount': 2,
2808 }, {
2809 # Videos tab
2810 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2811 'info_dict': {
2812 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2813 'title': 'lex will - Videos',
2814 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2815 },
2816 'playlist_mincount': 975,
2817 }, {
2818 # Videos tab, sorted by popular
2819 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2820 'info_dict': {
2821 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2822 'title': 'lex will - Videos',
2823 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2824 },
2825 'playlist_mincount': 199,
2826 }, {
2827 # Playlists tab
2828 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2829 'info_dict': {
2830 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2831 'title': 'lex will - Playlists',
2832 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2833 },
2834 'playlist_mincount': 17,
2835 }, {
2836 # Community tab
2837 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2838 'info_dict': {
2839 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2840 'title': 'lex will - Community',
2841 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2842 },
2843 'playlist_mincount': 18,
2844 }, {
2845 # Channels tab
2846 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2847 'info_dict': {
2848 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2849 'title': 'lex will - Channels',
2850 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2851 },
2852 'playlist_mincount': 138,
2853 }, {
2854 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2855 'only_matching': True,
2856 }, {
2857 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2858 'only_matching': True,
2859 }, {
2860 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2861 'only_matching': True,
2862 }, {
2863 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2864 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2865 'info_dict': {
2866 'title': '29C3: Not my department',
2867 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2868 'uploader': 'Christiaan008',
2869 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2870 },
2871 'playlist_count': 96,
2872 }, {
2873 'note': 'Large playlist',
2874 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2875 'info_dict': {
2876 'title': 'Uploads from Cauchemar',
2877 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2878 'uploader': 'Cauchemar',
2879 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2880 },
2881 'playlist_mincount': 1123,
2882 }, {
2883 # even larger playlist, 8832 videos
2884 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2885 'only_matching': True,
2886 }, {
2887 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2888 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2889 'info_dict': {
2890 'title': 'Uploads from Interstellar Movie',
2891 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2892 'uploader': 'Interstellar Movie',
2893 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2894 },
2895 'playlist_mincount': 21,
2896 }, {
2897 # https://github.com/ytdl-org/youtube-dl/issues/21844
2898 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2899 'info_dict': {
2900 'title': 'Data Analysis with Dr Mike Pound',
2901 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2902 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2903 'uploader': 'Computerphile',
2904 },
2905 'playlist_mincount': 11,
2906 }, {
2907 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2908 'only_matching': True,
2909 }, {
2910 # Playlist URL that does not actually serve a playlist
2911 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2912 'info_dict': {
2913 'id': 'FqZTN594JQw',
2914 'ext': 'webm',
2915 'title': "Smiley's People 01 detective, Adventure Series, Action",
2916 'uploader': 'STREEM',
2917 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2918 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2919 'upload_date': '20150526',
2920 'license': 'Standard YouTube License',
2921 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2922 'categories': ['People & Blogs'],
2923 'tags': list,
2924 'view_count': int,
2925 'like_count': int,
2926 'dislike_count': int,
2927 },
2928 'params': {
2929 'skip_download': True,
2930 },
2931 'skip': 'This video is not available.',
2932 'add_ie': [YoutubeIE.ie_key()],
2933 }, {
2934 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2935 'only_matching': True,
2936 }, {
2937 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2938 'only_matching': True,
2939 }, {
2940 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2941 'info_dict': {
2942 'id': '9Auq9mYxFEE',
2943 'ext': 'mp4',
2944 'title': 'Watch Sky News live',
2945 'uploader': 'Sky News',
2946 'uploader_id': 'skynews',
2947 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2948 'upload_date': '20191102',
2949 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2950 'categories': ['News & Politics'],
2951 'tags': list,
2952 'like_count': int,
2953 'dislike_count': int,
2954 },
2955 'params': {
2956 'skip_download': True,
2957 },
2958 }, {
2959 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2960 'info_dict': {
2961 'id': 'a48o2S1cPoo',
2962 'ext': 'mp4',
2963 'title': 'The Young Turks - Live Main Show',
2964 'uploader': 'The Young Turks',
2965 'uploader_id': 'TheYoungTurks',
2966 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2967 'upload_date': '20150715',
2968 'license': 'Standard YouTube License',
2969 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2970 'categories': ['News & Politics'],
2971 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2972 'like_count': int,
2973 'dislike_count': int,
2974 },
2975 'params': {
2976 'skip_download': True,
2977 },
2978 'only_matching': True,
2979 }, {
2980 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2981 'only_matching': True,
2982 }, {
2983 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2984 'only_matching': True,
2985 }, {
2986 'url': 'https://www.youtube.com/feed/trending',
2987 'only_matching': True,
2988 }, {
2989 # needs auth
2990 'url': 'https://www.youtube.com/feed/library',
2991 'only_matching': True,
2992 }, {
2993 # needs auth
2994 'url': 'https://www.youtube.com/feed/history',
2995 'only_matching': True,
2996 }, {
2997 # needs auth
2998 'url': 'https://www.youtube.com/feed/subscriptions',
2999 'only_matching': True,
3000 }, {
3001 # needs auth
3002 'url': 'https://www.youtube.com/feed/watch_later',
3003 'only_matching': True,
3004 }, {
3005 # no longer available?
3006 'url': 'https://www.youtube.com/feed/recommended',
3007 'only_matching': True,
3008 }, {
3009 # inline playlist with not always working continuations
3010 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3011 'only_matching': True,
3012 }, {
3013 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3014 'only_matching': True,
3015 }, {
3016 'url': 'https://www.youtube.com/course',
3017 'only_matching': True,
3018 }, {
3019 'url': 'https://www.youtube.com/zsecurity',
3020 'only_matching': True,
3021 }, {
3022 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3023 'only_matching': True,
3024 }, {
3025 'url': 'https://www.youtube.com/TheYoungTurks/live',
3026 'only_matching': True,
3027 }]
3028
3029 @classmethod
3030 def suitable(cls, url):
3031 return False if YoutubeIE.suitable(url) else super(
3032 YoutubeTabIE, cls).suitable(url)
3033
3034 def _extract_channel_id(self, webpage):
3035 channel_id = self._html_search_meta(
3036 'channelId', webpage, 'channel id', default=None)
3037 if channel_id:
3038 return channel_id
3039 channel_url = self._html_search_meta(
3040 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3041 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3042 'twitter:app:url:googleplay'), webpage, 'channel url')
3043 return self._search_regex(
3044 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3045 channel_url, 'channel id')
3046
3047 @staticmethod
3048 def _extract_grid_item_renderer(item):
3049 for item_kind in ('Playlist', 'Video', 'Channel'):
3050 renderer = item.get('grid%sRenderer' % item_kind)
3051 if renderer:
3052 return renderer
3053
3054 def _grid_entries(self, grid_renderer):
3055 for item in grid_renderer['items']:
3056 if not isinstance(item, dict):
3057 continue
3058 renderer = self._extract_grid_item_renderer(item)
3059 if not isinstance(renderer, dict):
3060 continue
3061 title = try_get(
3062 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3063 # playlist
3064 playlist_id = renderer.get('playlistId')
3065 if playlist_id:
3066 yield self.url_result(
3067 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3068 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3069 video_title=title)
3070 # video
3071 video_id = renderer.get('videoId')
3072 if video_id:
3073 yield self._extract_video(renderer)
3074 # channel
3075 channel_id = renderer.get('channelId')
3076 if channel_id:
3077 title = try_get(
3078 renderer, lambda x: x['title']['simpleText'], compat_str)
3079 yield self.url_result(
3080 'https://www.youtube.com/channel/%s' % channel_id,
3081 ie=YoutubeTabIE.ie_key(), video_title=title)
3082
3083 def _shelf_entries_from_content(self, shelf_renderer):
3084 content = shelf_renderer.get('content')
3085 if not isinstance(content, dict):
3086 return
3087 renderer = content.get('gridRenderer')
3088 if renderer:
3089 # TODO: add support for nested playlists so each shelf is processed
3090 # as separate playlist
3091 # TODO: this includes only first N items
3092 for entry in self._grid_entries(renderer):
3093 yield entry
3094 renderer = content.get('horizontalListRenderer')
3095 if renderer:
3096 # TODO
3097 pass
3098
3099 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3100 ep = try_get(
3101 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3102 compat_str)
3103 shelf_url = urljoin('https://www.youtube.com', ep)
3104 if shelf_url:
3105 # Skipping links to another channels, note that checking for
3106 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3107 # will not work
3108 if skip_channels and '/channels?' in shelf_url:
3109 return
3110 title = try_get(
3111 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3112 yield self.url_result(shelf_url, video_title=title)
3113 # Shelf may not contain shelf URL, fallback to extraction from content
3114 for entry in self._shelf_entries_from_content(shelf_renderer):
3115 yield entry
3116
3117 def _playlist_entries(self, video_list_renderer):
3118 for content in video_list_renderer['contents']:
3119 if not isinstance(content, dict):
3120 continue
3121 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3122 if not isinstance(renderer, dict):
3123 continue
3124 video_id = renderer.get('videoId')
3125 if not video_id:
3126 continue
3127 yield self._extract_video(renderer)
3128
3129 r""" # Not needed in the new implementation
3130 def _itemSection_entries(self, item_sect_renderer):
3131 for content in item_sect_renderer['contents']:
3132 if not isinstance(content, dict):
3133 continue
3134 renderer = content.get('videoRenderer', {})
3135 if not isinstance(renderer, dict):
3136 continue
3137 video_id = renderer.get('videoId')
3138 if not video_id:
3139 continue
3140 yield self._extract_video(renderer)
3141 """
3142
3143 def _rich_entries(self, rich_grid_renderer):
3144 renderer = try_get(
3145 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3146 video_id = renderer.get('videoId')
3147 if not video_id:
3148 return
3149 yield self._extract_video(renderer)
3150
3151 def _video_entry(self, video_renderer):
3152 video_id = video_renderer.get('videoId')
3153 if video_id:
3154 return self._extract_video(video_renderer)
3155
3156 def _post_thread_entries(self, post_thread_renderer):
3157 post_renderer = try_get(
3158 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3159 if not post_renderer:
3160 return
3161 # video attachment
3162 video_renderer = try_get(
3163 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
3164 video_id = None
3165 if video_renderer:
3166 entry = self._video_entry(video_renderer)
3167 if entry:
3168 yield entry
3169 # inline video links
3170 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3171 for run in runs:
3172 if not isinstance(run, dict):
3173 continue
3174 ep_url = try_get(
3175 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3176 if not ep_url:
3177 continue
3178 if not YoutubeIE.suitable(ep_url):
3179 continue
3180 ep_video_id = YoutubeIE._match_id(ep_url)
3181 if video_id == ep_video_id:
3182 continue
3183 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
3184
3185 def _post_thread_continuation_entries(self, post_thread_continuation):
3186 contents = post_thread_continuation.get('contents')
3187 if not isinstance(contents, list):
3188 return
3189 for content in contents:
3190 renderer = content.get('backstagePostThreadRenderer')
3191 if not isinstance(renderer, dict):
3192 continue
3193 for entry in self._post_thread_entries(renderer):
3194 yield entry
3195
3196 @staticmethod
3197 def _build_continuation_query(continuation, ctp=None):
3198 query = {
3199 'ctoken': continuation,
3200 'continuation': continuation,
3201 }
3202 if ctp:
3203 query['itct'] = ctp
3204 return query
3205
3206 @staticmethod
3207 def _extract_next_continuation_data(renderer):
3208 next_continuation = try_get(
3209 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
3210 if not next_continuation:
3211 return
3212 continuation = next_continuation.get('continuation')
3213 if not continuation:
3214 return
3215 ctp = next_continuation.get('clickTrackingParams')
3216 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3217
3218 @classmethod
3219 def _extract_continuation(cls, renderer):
3220 next_continuation = cls._extract_next_continuation_data(renderer)
3221 if next_continuation:
3222 return next_continuation
3223 contents = renderer.get('contents')
3224 if not isinstance(contents, list):
3225 return
3226 for content in contents:
3227 if not isinstance(content, dict):
3228 continue
3229 continuation_ep = try_get(
3230 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
3231 dict)
3232 if not continuation_ep:
3233 continue
3234 continuation = try_get(
3235 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
3236 if not continuation:
3237 continue
3238 ctp = continuation_ep.get('clickTrackingParams')
3239 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3240
    def _entries(self, tab, identity_token):
        """Generate all entries of a tab, following continuations (pagination)
        via browse_ajax until YouTube stops returning them.

        identity_token, when given, is sent as x-youtube-identity-token so
        logged-in content can be listed.
        """

        # extract_entries also records the next continuation into
        # continuation_list[0] as a side effect (a generator cannot simply
        # return it).
        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Not an item section: may still be a rich item (e.g. home feed)
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                # Dispatch each item-section content to the matching helper;
                # the first recognized renderer kind wins per content.
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        # On the Channels tab, shelves legitimately link to
                        # other channels, so do not skip channel links there
                        is_channels_tab = tab.get('title') == 'Channels'
                        for entry in self._shelf_entries(renderer, not is_channels_tab):
                            yield entry
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        # Single-element list used as a writable cell; Python 2 does not support nonlocal
        continuation_list = [None]
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]

        # Web-client identification headers sent with browse_ajax requests
        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            # browse_ajax responds with a two-element array; the payload is at index 1
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Legacy continuation format: continuationContents
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
                if continuation_renderer:
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # Newer continuation format: onResponseReceivedActions
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    # NOTE(review): this re-requests the same continuation
                    # token; verify this cannot loop forever on a malformed
                    # response
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
            break
3384
3385 @staticmethod
3386 def _extract_selected_tab(tabs):
3387 for tab in tabs:
3388 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3389 return tab['tabRenderer']
3390 else:
3391 raise ExtractorError('Unable to find selected tab')
3392
3393 @staticmethod
3394 def _extract_uploader(data):
3395 uploader = {}
3396 sidebar_renderer = try_get(
3397 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3398 if sidebar_renderer:
3399 for item in sidebar_renderer:
3400 if not isinstance(item, dict):
3401 continue
3402 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3403 if not isinstance(renderer, dict):
3404 continue
3405 owner = try_get(
3406 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3407 if owner:
3408 uploader['uploader'] = owner.get('text')
3409 uploader['uploader_id'] = try_get(
3410 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3411 uploader['uploader_url'] = urljoin(
3412 'https://www.youtube.com/',
3413 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3414 return uploader
3415
3416 def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
3417 selected_tab = self._extract_selected_tab(tabs)
3418 renderer = try_get(
3419 data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
3420 playlist_id = title = description = None
3421 if renderer:
3422 channel_title = renderer.get('title') or item_id
3423 tab_title = selected_tab.get('title')
3424 title = channel_title or item_id
3425 if tab_title:
3426 title += ' - %s' % tab_title
3427 description = renderer.get('description')
3428 playlist_id = renderer.get('externalId')
3429
3430 # this has thumbnails, but there is currently no thumbnail field for playlists
3431 # sidebar.playlistSidebarRenderer has even more data, but its stucture is more complec
3432 renderer = try_get(
3433 data, lambda x: x['microformat']['microformatDataRenderer'], dict)
3434 if not renderer:
3435 renderer = try_get(
3436 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
3437 if renderer:
3438 title = renderer.get('title')
3439 description = renderer.get('description')
3440 playlist_id = item_id
3441
3442 if playlist_id is None:
3443 playlist_id = item_id
3444 if title is None:
3445 title = "Youtube " + playlist_id.title()
3446 playlist = self.playlist_result(
3447 self._entries(selected_tab, identity_token),
3448 playlist_id=playlist_id, playlist_title=title,
3449 playlist_description=description)
3450 playlist.update(self._extract_uploader(data))
3451 return playlist
3452
3453 def _extract_from_playlist(self, item_id, url, data, playlist):
3454 title = playlist.get('title') or try_get(
3455 data, lambda x: x['titleText']['simpleText'], compat_str)
3456 playlist_id = playlist.get('playlistId') or item_id
3457 # Inline playlist rendition continuation does not always work
3458 # at Youtube side, so delegating regular tab-based playlist URL
3459 # processing whenever possible.
3460 playlist_url = urljoin(url, try_get(
3461 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3462 compat_str))
3463 if playlist_url and playlist_url != url:
3464 return self.url_result(
3465 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3466 video_title=title)
3467 return self.playlist_result(
3468 self._playlist_entries(playlist), playlist_id=playlist_id,
3469 playlist_title=title)
3470
3471 @staticmethod
3472 def _extract_alerts(data):
3473 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
3474 if not isinstance(alert_dict, dict):
3475 continue
3476 for renderer in alert_dict:
3477 alert = alert_dict[renderer]
3478 alert_type = alert.get('type')
3479 if not alert_type:
3480 continue
3481 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
3482 if message:
3483 yield alert_type, message
3484 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
3485 message = try_get(run, lambda x: x['text'], compat_str)
3486 if message:
3487 yield alert_type, message
3488
3489 def _extract_identity_token(self, webpage, item_id):
3490 ytcfg = self._extract_ytcfg(item_id, webpage)
3491 if ytcfg:
3492 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
3493 if token:
3494 return token
3495 return self._search_regex(
3496 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
3497 'identity token', default=None)
3498
    def _real_extract(self, url):
        """Dispatch a tab/playlist/feed URL to the right extraction path.

        Order of attempts: tabbed page -> inline watch-page playlist ->
        single-video fallback; raises if nothing is recognized.
        """
        item_id = self._match_id(url)
        # Normalize the host so all subsequent requests hit www.youtube.com
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # A bare channel/user home page defaults to the full /videos listing
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        # watch-style URL without a video id: fall back to its playlist, if any
        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
            if playlist_id:
                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
            else:
                raise ExtractorError('Unable to recognize tab page')
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        # Surface YouTube alerts: only the last error alert is raised; any
        # earlier errors and all non-error alerts become warnings.
        err_msg = None
        for alert_type, alert_message in self._extract_alerts(data):
            if alert_type.lower() == 'error':
                if err_msg:
                    self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
                err_msg = alert_message
            else:
                self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        if err_msg:
            raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3558
3559
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Anything YoutubeTabIE can handle takes precedence over this IE.
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE via a canonical playlist URL."""
        playlist_id = self._match_id(url)
        # A bare playlist id has no query string; synthesize one.
        qs = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query) or {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3634
3635
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite a short youtu.be link with a list param to a watch URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id, playlist_id = mobj.group('id', 'playlist_id')
        watch_url = update_url_query('https://www.youtube.com/watch', {
            'v': video_id,
            'list': playlist_id,
            'feature': 'youtu.be',
        })
        return self.url_result(
            watch_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3674
3675
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Map the ytuser: pseudo-URL onto the real user page."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3689
3690
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos are exposed as the special playlist "LL".
        liked_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_url, ie=YoutubeTabIE.ie_key())
3708
3709
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video entries for query via the innertube search API,
        following continuation tokens across result pages."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                # The token lives in its own continuationItemRenderer entry,
                # which has no itemSectionRenderer, so it must be extracted
                # *before* the itemSectionRenderer check below skips the
                # entry. (Previously this lookup happened after the skip and
                # was never reached, so pagination stopped after one page.)
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)
                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3790
3791
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same extractor as youtube:search, but with a search filter applied.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # Opaque innertube 'params' value; presumably the sort-by-upload-date
    # filter, matching IE_DESC - TODO confirm against the API.
    _SEARCH_PARAMS = 'CAI%3D'
3797
3798
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        """Run the query embedded in a /results URL through the search IE."""
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        # _VALID_URL guarantees one of search_query/q is present
        query = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3824
3825
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """Common base for the authenticated feed extractors.

    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so log in up front.
        self._login()

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
3846
3847
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The watch-later list is exposed as the special playlist "WL".
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3860
3861
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches the bare youtube.com front page (see the last test below)
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
3876
3877
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Delegates extraction to YoutubeFeedsInfoExtractor via _FEED_NAME
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
3889
3890
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Delegates extraction to YoutubeFeedsInfoExtractor via _FEED_NAME
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
3899
3900
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution URLs that carry query parameters but no
    # video id; exists only to raise a helpful error message.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted without a video id; tell the user how to
        # fix the command line instead.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3948
3949
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose video id is shorter than the required 11
    # characters; exists only to raise a helpful error message.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)
3965
3966
3967 # Do Youtube show urls even exist anymore? I couldn't find any
3968 r'''
3969 class YoutubeShowIE(YoutubeTabIE):
3970 IE_DESC = 'YouTube.com (multi-season) shows'
3971 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3972 IE_NAME = 'youtube:show'
3973 _TESTS = [{
3974 'url': 'https://www.youtube.com/show/airdisasters',
3975 'playlist_mincount': 5,
3976 'info_dict': {
3977 'id': 'airdisasters',
3978 'title': 'Air Disasters',
3979 }
3980 }]
3981
3982 def _real_extract(self, url):
3983 playlist_id = self._match_id(url)
3984 return super(YoutubeShowIE, self)._real_extract(
3985 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3986 '''