]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dlc/extractor/youtube.py
Strip out internal fields such as `_filename` from infojson (Closes #42)
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28)
29from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 ExtractorError,
34 float_or_none,
35 get_element_by_id,
36 int_or_none,
37 mimetype2ext,
38 parse_codecs,
39 parse_count,
40 parse_duration,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_or_none,
45 str_to_int,
46 try_get,
47 unescapeHTML,
48 unified_strdate,
49 unsmuggle_url,
50 update_url_query,
51 uppercase_escape,
52 url_or_none,
53 urlencode_postdata,
54 urljoin,
55)
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of the (reverse-engineered) "sl" sign-in API:
    # account lookup, password challenge, and two-factor challenge.
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is the "TL" token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path components that are site features rather than channel/user names;
    # used (as a regex alternation) to avoid misinterpreting such URLs.
    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist IDs (prefixed alphanumeric IDs) as well as the special
    # mix/watch-later/liked lists (RDMM, WL, LL, LM).
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _set_language(self):
        """Force English interface/formats via the PREF cookie."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result entries for YoutubeIE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs carry session tokens required by the sign-in API.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow. f_req is a positional JSON
            # array whose layout was reverse-engineered from the web client.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses start with an anti-XSSI prefix (e.g. ")]}'");
                # strip everything before the first '[' to get valid JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Positional payload for the account-lookup step. The meaning of the
        # individual slots is undocumented (reverse-engineered) — do not
        # reorder or alter them.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token needed by the subsequent challenge steps.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Positional payload for the password-challenge step; same caveat as
        # lookup_req above.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # NOTE(review): returns None (falsy) here rather than False as
            # elsewhere — callers only test truthiness, so this is equivalent.
            return

        # A non-empty entry at [0][5] signals a login error.
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # NOTE(review): due to operator precedence this warns with the
            # full "Unable to login: Invalid password" only for
            # INCORRECT_ANSWER_ENTERED; otherwise it prints the raw
            # login_msg without the "Unable to login:" prefix — confirm
            # whether that was intended.
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Presence of a challenge entry means additional verification (TFA or
        # an interactive web challenge) is required before login completes.
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token required to address the TFA challenge endpoint.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes are sometimes entered with the SMS "G-" prefix.
                tfa_code = remove_start(tfa_code, 'G-')

                # Positional payload for the TFA step; layout is
                # reverse-engineered — do not reorder.
                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # NOTE(review): same precedence quirk as the login
                    # warning above — the "Unable to finish TFA:" prefix is
                    # only applied for INCORRECT_ANSWER_ENTERED.
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Non-TFA challenges cannot be solved programmatically; map
                # known challenge codes to a human-readable explanation.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Fetching the CheckCookie URL finalizes the session cookies.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through myaccount.google.com.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Wrap the base downloader, passing the query dict through a copy
        so callers' dicts are never mutated."""
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt login before any extraction."""
        if self._downloader is None:
            return
        self._set_language()
        # Login failure is non-fatal here; extraction proceeds unauthenticated.
        if not self._login():
            return

    # Minimal request context for the InnerTube ("youtubei") API, mimicking
    # the desktop web client.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # JSON blobs embedded in watch/channel pages.
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    # Tokens that mark the end of the ytInitialData assignment, used to
    # anchor the non-greedy JSON match above.
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id):
        """POST to the InnerTube endpoint `ep` with `query` merged into the
        default WEB-client context, and return the parsed JSON response."""
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            # Public API key used by the web client (not a secret).
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse the embedded ytInitialData JSON from `webpage`.

        First tries the boundary-anchored pattern, then falls back to the
        bare (non-greedy) pattern. Raises if neither matches.
        """
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Extract and parse the ytcfg.set(...) config JSON from `webpage`.

        Non-fatal: returns an empty dict / None when absent or unparsable.
        """
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)

    def _extract_video(self, renderer):
        """Build a url_transparent result dict from a videoRenderer object.

        Every field is extracted best-effort with try_get, so missing keys
        simply yield None in the result.
        """
        video_id = renderer.get('videoId')
        title = try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str)
        description = try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str)
        duration = parse_duration(try_get(
            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
        view_count_text = try_get(
            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
        # Strip whitespace (incl. thousands separators in some locales) and
        # take the leading digit group, e.g. "1,234 views" -> 1234.
        view_count = str_to_int(self._search_regex(
            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
            'view count', default=None))
        uploader = try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
        return {
            '_type': 'url_transparent',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
            'url': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'uploader': uploader,
        }
355
356
357class YoutubeIE(YoutubeBaseInfoExtractor):
358 IE_DESC = 'YouTube.com'
359 _VALID_URL = r"""(?x)^
360 (
361 (?:https?://|//) # http(s):// or protocol-independent URL
362 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
363 (?:www\.)?deturl\.com/www\.youtube\.com/|
364 (?:www\.)?pwnyoutube\.com/|
365 (?:www\.)?hooktube\.com/|
366 (?:www\.)?yourepeat\.com/|
367 tube\.majestyc\.net/|
368 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
369 (?:(?:www|dev)\.)?invidio\.us/|
370 (?:(?:www|no)\.)?invidiou\.sh/|
371 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
372 (?:www\.)?invidious\.kabi\.tk/|
373 (?:www\.)?invidious\.13ad\.de/|
374 (?:www\.)?invidious\.mastodon\.host/|
375 (?:www\.)?invidious\.zapashcanon\.fr/|
376 (?:www\.)?invidious\.kavin\.rocks/|
377 (?:www\.)?invidious\.tube/|
378 (?:www\.)?invidiou\.site/|
379 (?:www\.)?invidious\.site/|
380 (?:www\.)?invidious\.xyz/|
381 (?:www\.)?invidious\.nixnet\.xyz/|
382 (?:www\.)?invidious\.drycat\.fr/|
383 (?:www\.)?tube\.poal\.co/|
384 (?:www\.)?tube\.connect\.cafe/|
385 (?:www\.)?vid\.wxzm\.sx/|
386 (?:www\.)?vid\.mint\.lgbt/|
387 (?:www\.)?yewtu\.be/|
388 (?:www\.)?yt\.elukerio\.org/|
389 (?:www\.)?yt\.lelux\.fi/|
390 (?:www\.)?invidious\.ggc-project\.de/|
391 (?:www\.)?yt\.maisputain\.ovh/|
392 (?:www\.)?invidious\.13ad\.de/|
393 (?:www\.)?invidious\.toot\.koeln/|
394 (?:www\.)?invidious\.fdn\.fr/|
395 (?:www\.)?watch\.nettohikari\.com/|
396 (?:www\.)?kgg2m7yk5aybusll\.onion/|
397 (?:www\.)?qklhadlycap4cnod\.onion/|
398 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
399 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
400 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
401 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
402 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
403 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
404 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
405 (?:.*?\#/)? # handle anchor (#/) redirect urls
406 (?: # the various things that can precede the ID:
407 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
408 |(?: # or the v= param in all its forms
409 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
410 (?:\?|\#!?) # the params delimiter ? or # or #!
411 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
412 v=
413 )
414 ))
415 |(?:
416 youtu\.be| # just youtu.be/xxxx
417 vid\.plus| # or vid.plus/xxxx
418 zwearz\.com/watch| # or zwearz.com/watch/xxxx
419 )/
420 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
421 )
422 )? # all until now is optional -> you can pass the naked ID
423 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
424 (?!.*?\blist=
425 (?:
426 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
427 WL # WL are handled by the watch later IE
428 )
429 )
430 (?(1).+)? # if we found the ID, everything can follow
431 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
432 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
433 _PLAYER_INFO_RE = (
434 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
435 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
436 )
437 _formats = {
438 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
439 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
440 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
441 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
442 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
443 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
444 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
445 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
446 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
447 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
448 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
449 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
450 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
451 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
452 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
453 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
454 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
455 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
456
457
458 # 3D videos
459 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
460 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
461 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
462 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
463 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
464 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
465 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
466
467 # Apple HTTP Live Streaming
468 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
469 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
470 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
471 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
472 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
473 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
474 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
475 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
476
477 # DASH mp4 video
478 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
479 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
480 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
481 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
482 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
483 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
484 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
485 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
486 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
487 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
488 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
489 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
490
491 # Dash mp4 audio
492 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
493 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
494 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
495 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
496 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
497 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
498 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
499
500 # Dash webm
501 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
502 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
503 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
504 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
505 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
506 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
507 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
508 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
509 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
510 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
511 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
512 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
513 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
514 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
515 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
516 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
517 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
518 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
519 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
520 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
521 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
523
524 # Dash webm audio
525 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
526 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
527
528 # Dash webm audio with opus inside
529 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
530 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
531 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
532
533 # RTMP (unnamed)
534 '_rtmp': {'protocol': 'rtmp'},
535
536 # av01 video only formats sometimes served with "unknown" codecs
537 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
538 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
539 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
540 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
541 }
542 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
543
544 _GEO_BYPASS = False
545
546 IE_NAME = 'youtube'
547 _TESTS = [
548 {
549 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
550 'info_dict': {
551 'id': 'BaW_jenozKc',
552 'ext': 'mp4',
553 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
554 'uploader': 'Philipp Hagemeister',
555 'uploader_id': 'phihag',
556 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
557 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
558 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
559 'upload_date': '20121002',
560 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
561 'categories': ['Science & Technology'],
562 'tags': ['youtube-dl'],
563 'duration': 10,
564 'view_count': int,
565 'like_count': int,
566 'dislike_count': int,
567 'start_time': 1,
568 'end_time': 9,
569 }
570 },
571 {
572 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
573 'note': 'Embed-only video (#1746)',
574 'info_dict': {
575 'id': 'yZIXLfi8CZQ',
576 'ext': 'mp4',
577 'upload_date': '20120608',
578 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
579 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
580 'uploader': 'SET India',
581 'uploader_id': 'setindia',
582 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
583 'age_limit': 18,
584 }
585 },
586 {
587 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
588 'note': 'Use the first video ID in the URL',
589 'info_dict': {
590 'id': 'BaW_jenozKc',
591 'ext': 'mp4',
592 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
593 'uploader': 'Philipp Hagemeister',
594 'uploader_id': 'phihag',
595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
596 'upload_date': '20121002',
597 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
598 'categories': ['Science & Technology'],
599 'tags': ['youtube-dl'],
600 'duration': 10,
601 'view_count': int,
602 'like_count': int,
603 'dislike_count': int,
604 },
605 'params': {
606 'skip_download': True,
607 },
608 },
609 {
610 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
611 'note': '256k DASH audio (format 141) via DASH manifest',
612 'info_dict': {
613 'id': 'a9LDPn-MO4I',
614 'ext': 'm4a',
615 'upload_date': '20121002',
616 'uploader_id': '8KVIDEO',
617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
618 'description': '',
619 'uploader': '8KVIDEO',
620 'title': 'UHDTV TEST 8K VIDEO.mp4'
621 },
622 'params': {
623 'youtube_include_dash_manifest': True,
624 'format': '141',
625 },
626 'skip': 'format 141 not served anymore',
627 },
628 # DASH manifest with encrypted signature
629 {
630 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
631 'info_dict': {
632 'id': 'IB3lcPjvWLA',
633 'ext': 'm4a',
634 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
635 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
636 'duration': 244,
637 'uploader': 'AfrojackVEVO',
638 'uploader_id': 'AfrojackVEVO',
639 'upload_date': '20131011',
640 },
641 'params': {
642 'youtube_include_dash_manifest': True,
643 'format': '141/bestaudio[ext=m4a]',
644 },
645 },
646 # Controversy video
647 {
648 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
649 'info_dict': {
650 'id': 'T4XJQO3qol8',
651 'ext': 'mp4',
652 'duration': 219,
653 'upload_date': '20100909',
654 'uploader': 'Amazing Atheist',
655 'uploader_id': 'TheAmazingAtheist',
656 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
657 'title': 'Burning Everyone\'s Koran',
658 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
659 }
660 },
661 # Normal age-gate video (embed allowed)
662 {
663 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
664 'info_dict': {
665 'id': 'HtVdAasjOgU',
666 'ext': 'mp4',
667 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
668 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
669 'duration': 142,
670 'uploader': 'The Witcher',
671 'uploader_id': 'WitcherGame',
672 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
673 'upload_date': '20140605',
674 'age_limit': 18,
675 },
676 },
677 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
678 # YouTube Red ad is not captured for creator
679 {
680 'url': '__2ABJjxzNo',
681 'info_dict': {
682 'id': '__2ABJjxzNo',
683 'ext': 'mp4',
684 'duration': 266,
685 'upload_date': '20100430',
686 'uploader_id': 'deadmau5',
687 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
688 'creator': 'Dada Life, deadmau5',
689 'description': 'md5:12c56784b8032162bb936a5f76d55360',
690 'uploader': 'deadmau5',
691 'title': 'Deadmau5 - Some Chords (HD)',
692 'alt_title': 'This Machine Kills Some Chords',
693 },
694 'expected_warnings': [
695 'DASH manifest missing',
696 ]
697 },
698 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
699 {
700 'url': 'lqQg6PlCWgI',
701 'info_dict': {
702 'id': 'lqQg6PlCWgI',
703 'ext': 'mp4',
704 'duration': 6085,
705 'upload_date': '20150827',
706 'uploader_id': 'olympic',
707 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
708 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
709 'uploader': 'Olympic',
710 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
711 },
712 'params': {
713 'skip_download': 'requires avconv',
714 }
715 },
716 # Non-square pixels
717 {
718 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
719 'info_dict': {
720 'id': '_b-2C3KPAM0',
721 'ext': 'mp4',
722 'stretched_ratio': 16 / 9.,
723 'duration': 85,
724 'upload_date': '20110310',
725 'uploader_id': 'AllenMeow',
726 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
727 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
728 'uploader': '孫ᄋᄅ',
729 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
730 },
731 },
732 # url_encoded_fmt_stream_map is empty string
733 {
734 'url': 'qEJwOuvDf7I',
735 'info_dict': {
736 'id': 'qEJwOuvDf7I',
737 'ext': 'webm',
738 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
739 'description': '',
740 'upload_date': '20150404',
741 'uploader_id': 'spbelect',
742 'uploader': 'Наблюдатели Петербурга',
743 },
744 'params': {
745 'skip_download': 'requires avconv',
746 },
747 'skip': 'This live event has ended.',
748 },
749 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
750 {
751 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
752 'info_dict': {
753 'id': 'FIl7x6_3R5Y',
754 'ext': 'webm',
755 'title': 'md5:7b81415841e02ecd4313668cde88737a',
756 'description': 'md5:116377fd2963b81ec4ce64b542173306',
757 'duration': 220,
758 'upload_date': '20150625',
759 'uploader_id': 'dorappi2000',
760 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
761 'uploader': 'dorappi2000',
762 'formats': 'mincount:31',
763 },
764 'skip': 'not actual anymore',
765 },
766 # DASH manifest with segment_list
767 {
768 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
769 'md5': '8ce563a1d667b599d21064e982ab9e31',
770 'info_dict': {
771 'id': 'CsmdDsKjzN8',
772 'ext': 'mp4',
773 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
774 'uploader': 'Airtek',
775 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
776 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
777 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
778 },
779 'params': {
780 'youtube_include_dash_manifest': True,
781 'format': '135', # bestvideo
782 },
783 'skip': 'This live event has ended.',
784 },
785 {
786 # Multifeed videos (multiple cameras), URL is for Main Camera
787 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
788 'info_dict': {
789 'id': 'jqWvoWXjCVs',
790 'title': 'teamPGP: Rocket League Noob Stream',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
792 },
793 'playlist': [{
794 'info_dict': {
795 'id': 'jqWvoWXjCVs',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7335,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }, {
807 'info_dict': {
808 'id': '6h8e8xoXJzg',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
812 'duration': 7337,
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
817 'license': 'Standard YouTube License',
818 },
819 }, {
820 'info_dict': {
821 'id': 'PUOgX5z9xZw',
822 'ext': 'mp4',
823 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
824 'description': 'md5:dc7872fb300e143831327f1bae3af010',
825 'duration': 7337,
826 'upload_date': '20150721',
827 'uploader': 'Beer Games Beer',
828 'uploader_id': 'beergamesbeer',
829 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
830 'license': 'Standard YouTube License',
831 },
832 }, {
833 'info_dict': {
834 'id': 'teuwxikvS5k',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (zim)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
838 'duration': 7334,
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
843 'license': 'Standard YouTube License',
844 },
845 }],
846 'params': {
847 'skip_download': True,
848 },
849 'skip': 'This video is not available.',
850 },
851 {
852 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
853 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
854 'info_dict': {
855 'id': 'gVfLd0zydlo',
856 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
857 },
858 'playlist_count': 2,
859 'skip': 'Not multifeed anymore',
860 },
861 {
862 'url': 'https://vid.plus/FlRa-iH7PGw',
863 'only_matching': True,
864 },
865 {
866 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
867 'only_matching': True,
868 },
869 {
870 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
871 # Also tests cut-off URL expansion in video description (see
872 # https://github.com/ytdl-org/youtube-dl/issues/1892,
873 # https://github.com/ytdl-org/youtube-dl/issues/8164)
874 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
875 'info_dict': {
876 'id': 'lsguqyKfVQg',
877 'ext': 'mp4',
878 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
879 'alt_title': 'Dark Walk - Position Music',
880 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
881 'duration': 133,
882 'upload_date': '20151119',
883 'uploader_id': 'IronSoulElf',
884 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
885 'uploader': 'IronSoulElf',
886 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
887 'track': 'Dark Walk - Position Music',
888 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
889 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
890 },
891 'params': {
892 'skip_download': True,
893 },
894 },
895 {
896 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
897 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
898 'only_matching': True,
899 },
900 {
901 # Video with yt:stretch=17:0
902 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
903 'info_dict': {
904 'id': 'Q39EVAstoRM',
905 'ext': 'mp4',
906 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
907 'description': 'md5:ee18a25c350637c8faff806845bddee9',
908 'upload_date': '20151107',
909 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
910 'uploader': 'CH GAMER DROID',
911 },
912 'params': {
913 'skip_download': True,
914 },
915 'skip': 'This video does not exist.',
916 },
917 {
918 # Video licensed under Creative Commons
919 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
920 'info_dict': {
921 'id': 'M4gD1WSo5mA',
922 'ext': 'mp4',
923 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
924 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
925 'duration': 721,
926 'upload_date': '20150127',
927 'uploader_id': 'BerkmanCenter',
928 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
929 'uploader': 'The Berkman Klein Center for Internet & Society',
930 'license': 'Creative Commons Attribution license (reuse allowed)',
931 },
932 'params': {
933 'skip_download': True,
934 },
935 },
936 {
937 # Channel-like uploader_url
938 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
939 'info_dict': {
940 'id': 'eQcmzGIKrzg',
941 'ext': 'mp4',
942 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
943 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
944 'duration': 4060,
945 'upload_date': '20151119',
946 'uploader': 'Bernie Sanders',
947 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
948 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
949 'license': 'Creative Commons Attribution license (reuse allowed)',
950 },
951 'params': {
952 'skip_download': True,
953 },
954 },
955 {
956 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
957 'only_matching': True,
958 },
959 {
960 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
961 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
962 'only_matching': True,
963 },
964 {
965 # Rental video preview
966 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
967 'info_dict': {
968 'id': 'uGpuVWrhIzE',
969 'ext': 'mp4',
970 'title': 'Piku - Trailer',
971 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
972 'upload_date': '20150811',
973 'uploader': 'FlixMatrix',
974 'uploader_id': 'FlixMatrixKaravan',
975 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
976 'license': 'Standard YouTube License',
977 },
978 'params': {
979 'skip_download': True,
980 },
981 'skip': 'This video is not available.',
982 },
983 {
984 # YouTube Red video with episode data
985 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
986 'info_dict': {
987 'id': 'iqKdEhx-dD4',
988 'ext': 'mp4',
989 'title': 'Isolation - Mind Field (Ep 1)',
990 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
991 'duration': 2085,
992 'upload_date': '20170118',
993 'uploader': 'Vsauce',
994 'uploader_id': 'Vsauce',
995 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
996 'series': 'Mind Field',
997 'season_number': 1,
998 'episode_number': 1,
999 },
1000 'params': {
1001 'skip_download': True,
1002 },
1003 'expected_warnings': [
1004 'Skipping DASH manifest',
1005 ],
1006 },
1007 {
1008 # The following content has been identified by the YouTube community
1009 # as inappropriate or offensive to some audiences.
1010 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1011 'info_dict': {
1012 'id': '6SJNVb0GnPI',
1013 'ext': 'mp4',
1014 'title': 'Race Differences in Intelligence',
1015 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1016 'duration': 965,
1017 'upload_date': '20140124',
1018 'uploader': 'New Century Foundation',
1019 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1020 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1021 },
1022 'params': {
1023 'skip_download': True,
1024 },
1025 },
1026 {
1027 # itag 212
1028 'url': '1t24XAntNCY',
1029 'only_matching': True,
1030 },
1031 {
1032 # geo restricted to JP
1033 'url': 'sJL6WA-aGkQ',
1034 'only_matching': True,
1035 },
1036 {
1037 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1038 'only_matching': True,
1039 },
1040 {
1041 # DRM protected
1042 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1043 'only_matching': True,
1044 },
1045 {
1046 # Video with unsupported adaptive stream type formats
1047 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1048 'info_dict': {
1049 'id': 'Z4Vy8R84T1U',
1050 'ext': 'mp4',
1051 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1052 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1053 'duration': 433,
1054 'upload_date': '20130923',
1055 'uploader': 'Amelia Putri Harwita',
1056 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1057 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1058 'formats': 'maxcount:10',
1059 },
1060 'params': {
1061 'skip_download': True,
1062 'youtube_include_dash_manifest': False,
1063 },
1064 'skip': 'not actual anymore',
1065 },
1066 {
1067 # Youtube Music Auto-generated description
1068 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1069 'info_dict': {
1070 'id': 'MgNrAu2pzNs',
1071 'ext': 'mp4',
1072 'title': 'Voyeur Girl',
1073 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1074 'upload_date': '20190312',
1075 'uploader': 'Stephen - Topic',
1076 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1077 'artist': 'Stephen',
1078 'track': 'Voyeur Girl',
1079 'album': 'it\'s too much love to know my dear',
1080 'release_date': '20190313',
1081 'release_year': 2019,
1082 },
1083 'params': {
1084 'skip_download': True,
1085 },
1086 },
1087 {
1088 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1089 'only_matching': True,
1090 },
1091 {
1092 # invalid -> valid video id redirection
1093 'url': 'DJztXj2GPfl',
1094 'info_dict': {
1095 'id': 'DJztXj2GPfk',
1096 'ext': 'mp4',
1097 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1098 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1099 'upload_date': '20090125',
1100 'uploader': 'Prochorowka',
1101 'uploader_id': 'Prochorowka',
1102 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1103 'artist': 'Panjabi MC',
1104 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1105 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1106 },
1107 'params': {
1108 'skip_download': True,
1109 },
1110 },
1111 {
1112 # empty description results in an empty string
1113 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1114 'info_dict': {
1115 'id': 'x41yOUIvK2k',
1116 'ext': 'mp4',
1117 'title': 'IMG 3456',
1118 'description': '',
1119 'upload_date': '20170613',
1120 'uploader_id': 'ElevageOrVert',
1121 'uploader': 'ElevageOrVert',
1122 },
1123 'params': {
1124 'skip_download': True,
1125 },
1126 },
1127 {
1128 # with '};' inside yt initial data (see [1])
1129 # see [2] for an example with '};' inside ytInitialPlayerResponse
1130 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1131 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1132 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1133 'info_dict': {
1134 'id': 'CHqg6qOn4no',
1135 'ext': 'mp4',
1136 'title': 'Part 77 Sort a list of simple types in c#',
1137 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1138 'upload_date': '20130831',
1139 'uploader_id': 'kudvenkat',
1140 'uploader': 'kudvenkat',
1141 },
1142 'params': {
1143 'skip_download': True,
1144 },
1145 },
1146 {
1147 # another example of '};' in ytInitialData
1148 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1149 'only_matching': True,
1150 },
1151 {
1152 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1153 'only_matching': True,
1154 },
1155 ]
1156
1157 def __init__(self, *args, **kwargs):
1158 super(YoutubeIE, self).__init__(*args, **kwargs)
1159 self._player_cache = {}
1160
1161 def report_video_info_webpage_download(self, video_id):
1162 """Report attempt to download video info webpage."""
1163 self.to_screen('%s: Downloading video info webpage' % video_id)
1164
1165 def report_information_extraction(self, video_id):
1166 """Report attempt to extract video information."""
1167 self.to_screen('%s: Extracting video information' % video_id)
1168
1169 def report_unavailable_format(self, video_id, format):
1170 """Report extracted video URL."""
1171 self.to_screen('%s: Format %s not available' % (video_id, format))
1172
1173 def report_rtmp_download(self):
1174 """Indicate the download will use the RTMP protocol."""
1175 self.to_screen('RTMP download detected')
1176
1177 def _signature_cache_id(self, example_sig):
1178 """ Return a string representation of a signature """
1179 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1180
1181 @classmethod
1182 def _extract_player_info(cls, player_url):
1183 for player_re in cls._PLAYER_INFO_RE:
1184 id_m = re.search(player_re, player_url)
1185 if id_m:
1186 break
1187 else:
1188 raise ExtractorError('Cannot identify player %r' % player_url)
1189 return id_m.group('ext'), id_m.group('id')
1190
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Obtain a callable that decrypts signatures for the given player.

        The result is cached on disk keyed by player type, player id and the
        length layout of example_sig, so the expensive player download and
        parsing happens only once per player/signature layout.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes part of a cache filename, so it must not contain
        # path separators
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is just a list of input character positions:
            # applying it reorders/selects characters of the signature
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the position mapping by running the extracted function on a
        # probe string of unique characters, then persist it for reuse
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1230
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function
        (enabled by the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a contiguous run of indices as a slice expression
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, collapsing runs with stride
            # +1/-1 into slices and emitting isolated indices as s[i]
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    # Run ended: emit the accumulated slice
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Recover the index permutation by probing with unique characters
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1269
    def _parse_sig_js(self, jscode):
        """Locate the signature-deciphering function inside the player
        JavaScript and return a Python callable wrapping it."""
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The interpreted JS function takes its arguments as a list
        return lambda s: initial_function([s])
1290
1291 def _parse_sig_swf(self, file_contents):
1292 swfi = SWFInterpreter(file_contents)
1293 TARGET_CLASSNAME = 'SignatureDecipher'
1294 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1295 initial_function = swfi.extract_function(searched_class, 'decipher')
1296 return lambda s: initial_function([s])
1297
1298 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1299 """Turn the encrypted s field into a working signature"""
1300
1301 if player_url is None:
1302 raise ExtractorError('Cannot decrypt signature without player_url')
1303
1304 if player_url.startswith('//'):
1305 player_url = 'https:' + player_url
1306 elif not re.match(r'https?://', player_url):
1307 player_url = compat_urlparse.urljoin(
1308 'https://www.youtube.com', player_url)
1309 try:
1310 player_id = (player_url, self._signature_cache_id(s))
1311 if player_id not in self._player_cache:
1312 func = self._extract_signature_function(
1313 video_id, player_url, s
1314 )
1315 self._player_cache[player_id] = func
1316 func = self._player_cache[player_id]
1317 if self._downloader.params.get('youtube_print_sig_code'):
1318 self._print_sig_code(func, s)
1319 return func(s)
1320 except Exception as e:
1321 tb = traceback.format_exc()
1322 raise ExtractorError(
1323 'Signature extraction failed: ' + tb, cause=e)
1324
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Download the list of manually created subtitle tracks for a video.

        Returns a dict mapping language code to a list of subtitle format
        dicts, or {} (after a warning) when none are available. The webpage
        argument is not used by this implementation.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track found for each language
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            # Synthetic entry with no 'url': the youtube_live_chat_replay
            # protocol downloader works from the video id instead
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1364
1365 def _get_ytplayer_config(self, video_id, webpage):
1366 patterns = (
1367 # User data may contain arbitrary character sequences that may affect
1368 # JSON extraction with regex, e.g. when '};' is contained the second
1369 # regex won't capture the whole JSON. Yet working around by trying more
1370 # concrete regex first keeping in mind proper quoted string handling
1371 # to be implemented in future that will replace this workaround (see
1372 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1373 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1374 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1375 r';ytplayer\.config\s*=\s*({.+?});',
1376 )
1377 config = self._search_regex(
1378 patterns, webpage, 'ytplayer.config', default=None)
1379 if config:
1380 return self._parse_json(
1381 uppercase_escape(config), video_id, fatal=False)
1382
1383 def _get_automatic_captions(self, video_id, player_response, player_config):
1384 """We need the webpage for getting the captions url, pass it as an
1385 argument to speed up the process."""
1386 self.to_screen('%s: Looking for automatic captions' % video_id)
1387 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
1388 if not (player_response or player_config):
1389 self._downloader.report_warning(err_msg)
1390 return {}
1391 try:
1392 args = player_config.get('args') if player_config else {}
1393 caption_url = args.get('ttsurl')
1394 if caption_url:
1395 timestamp = args['timestamp']
1396 # We get the available subtitles
1397 list_params = compat_urllib_parse_urlencode({
1398 'type': 'list',
1399 'tlangs': 1,
1400 'asrs': 1,
1401 })
1402 list_url = caption_url + '&' + list_params
1403 caption_list = self._download_xml(list_url, video_id)
1404 original_lang_node = caption_list.find('track')
1405 if original_lang_node is None:
1406 self._downloader.report_warning('Video doesn\'t have automatic captions')
1407 return {}
1408 original_lang = original_lang_node.attrib['lang_code']
1409 caption_kind = original_lang_node.attrib.get('kind', '')
1410
1411 sub_lang_list = {}
1412 for lang_node in caption_list.findall('target'):
1413 sub_lang = lang_node.attrib['lang_code']
1414 sub_formats = []
1415 for ext in self._SUBTITLE_FORMATS:
1416 params = compat_urllib_parse_urlencode({
1417 'lang': original_lang,
1418 'tlang': sub_lang,
1419 'fmt': ext,
1420 'ts': timestamp,
1421 'kind': caption_kind,
1422 })
1423 sub_formats.append({
1424 'url': caption_url + '&' + params,
1425 'ext': ext,
1426 })
1427 sub_lang_list[sub_lang] = sub_formats
1428 return sub_lang_list
1429
1430 def make_captions(sub_url, sub_langs):
1431 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1432 caption_qs = compat_parse_qs(parsed_sub_url.query)
1433 captions = {}
1434 for sub_lang in sub_langs:
1435 sub_formats = []
1436 for ext in self._SUBTITLE_FORMATS:
1437 caption_qs.update({
1438 'tlang': [sub_lang],
1439 'fmt': [ext],
1440 })
1441 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1442 query=compat_urllib_parse_urlencode(caption_qs, True)))
1443 sub_formats.append({
1444 'url': sub_url,
1445 'ext': ext,
1446 })
1447 captions[sub_lang] = sub_formats
1448 return captions
1449
1450 # New captions format as of 22.06.2017
1451 if player_response:
1452 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
1453 base_url = renderer['captionTracks'][0]['baseUrl']
1454 sub_lang_list = []
1455 for lang in renderer['translationLanguages']:
1456 lang_code = lang.get('languageCode')
1457 if lang_code:
1458 sub_lang_list.append(lang_code)
1459 return make_captions(base_url, sub_lang_list)
1460
1461 # Some videos don't provide ttsurl but rather caption_tracks and
1462 # caption_translation_languages (e.g. 20LmZk1hakA)
1463 # Does not used anymore as of 22.06.2017
1464 caption_tracks = args['caption_tracks']
1465 caption_translation_languages = args['caption_translation_languages']
1466 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
1467 sub_lang_list = []
1468 for lang in caption_translation_languages.split(','):
1469 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1470 sub_lang = lang_qs.get('lc', [None])[0]
1471 if sub_lang:
1472 sub_lang_list.append(sub_lang)
1473 return make_captions(caption_url, sub_lang_list)
1474 # An extractor error can be raise by the download process if there are
1475 # no automatic captions but there are subtitles
1476 except (KeyError, IndexError, ExtractorError):
1477 self._downloader.report_warning(err_msg)
1478 return {}
1479
1480 def _mark_watched(self, video_id, video_info, player_response):
1481 playback_url = url_or_none(try_get(
1482 player_response,
1483 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1484 video_info, lambda x: x['videostats_playback_base_url'][0]))
1485 if not playback_url:
1486 return
1487 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1488 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1489
1490 # cpn generation algorithm is reverse engineered from base.js.
1491 # In fact it works even with dummy cpn.
1492 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1493 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1494
1495 qs.update({
1496 'ver': ['2'],
1497 'cpn': [cpn],
1498 })
1499 playback_url = compat_urlparse.urlunparse(
1500 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1501
1502 self._download_webpage(
1503 playback_url, video_id, 'Marking watched',
1504 'Unable to mark watched', fatal=False)
1505
1506 @staticmethod
1507 def _extract_urls(webpage):
1508 # Embedded YouTube player
1509 entries = [
1510 unescapeHTML(mobj.group('url'))
1511 for mobj in re.finditer(r'''(?x)
1512 (?:
1513 <iframe[^>]+?src=|
1514 data-video-url=|
1515 <embed[^>]+?src=|
1516 embedSWF\(?:\s*|
1517 <object[^>]+data=|
1518 new\s+SWFObject\(
1519 )
1520 (["\'])
1521 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1522 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1523 \1''', webpage)]
1524
1525 # lazyYT YouTube embed
1526 entries.extend(list(map(
1527 unescapeHTML,
1528 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1529
1530 # Wordpress "YouTube Video Importer" plugin
1531 matches = re.findall(r'''(?x)<div[^>]+
1532 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1533 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1534 entries.extend(m[-1] for m in matches)
1535
1536 return entries
1537
1538 @staticmethod
1539 def _extract_url(webpage):
1540 urls = YoutubeIE._extract_urls(webpage)
1541 return urls[0] if urls else None
1542
1543 @classmethod
1544 def extract_id(cls, url):
1545 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1546 if mobj is None:
1547 raise ExtractorError('Invalid URL: %s' % url)
1548 video_id = mobj.group(2)
1549 return video_id
1550
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapter markers from the ytInitialData JSON embedded in
        the watch page.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when no usable chapter data is present.
        """
        if not webpage:
            return
        data = self._extract_yt_initial_data(video_id, webpage)
        if not data or not isinstance(data, dict):
            return
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Chapter start times are given in milliseconds; convert to
            # seconds (returns None when missing)
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one begins; the last chapter ends
            # at the video duration (skipped entirely when duration is None)
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1595
1596 @staticmethod
1597 def _extract_chapters_from_description(description, duration):
1598 if not description:
1599 return None
1600 chapter_lines = re.findall(
1601 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1602 description)
1603 if not chapter_lines:
1604 return None
1605 chapters = []
1606 for next_num, (chapter_line, time_point) in enumerate(
1607 chapter_lines, start=1):
1608 start_time = parse_duration(time_point)
1609 if start_time is None:
1610 continue
1611 if start_time > duration:
1612 break
1613 end_time = (duration if next_num == len(chapter_lines)
1614 else parse_duration(chapter_lines[next_num][1]))
1615 if end_time is None:
1616 continue
1617 if end_time > duration:
1618 end_time = duration
1619 if start_time > end_time:
1620 break
1621 chapter_title = re.sub(
1622 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1623 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1624 chapters.append({
1625 'start_time': start_time,
1626 'end_time': end_time,
1627 'title': chapter_title,
1628 })
1629 return chapters
1630
1631 def _extract_chapters(self, webpage, description, video_id, duration):
1632 return (self._extract_chapters_from_json(webpage, video_id, duration)
1633 or self._extract_chapters_from_description(description, duration))
1634
1635 def _real_extract(self, url):
1636 url, smuggled_data = unsmuggle_url(url, {})
1637
1638 proto = (
1639 'http' if self._downloader.params.get('prefer_insecure', False)
1640 else 'https')
1641
1642 start_time = None
1643 end_time = None
1644 parsed_url = compat_urllib_parse_urlparse(url)
1645 for component in [parsed_url.fragment, parsed_url.query]:
1646 query = compat_parse_qs(component)
1647 if start_time is None and 't' in query:
1648 start_time = parse_duration(query['t'][0])
1649 if start_time is None and 'start' in query:
1650 start_time = parse_duration(query['start'][0])
1651 if end_time is None and 'end' in query:
1652 end_time = parse_duration(query['end'][0])
1653
1654 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1655 mobj = re.search(self._NEXT_URL_RE, url)
1656 if mobj:
1657 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1658 video_id = self.extract_id(url)
1659
1660 # Get video webpage
1661 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1662 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1663
1664 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1665 video_id = qs.get('v', [None])[0] or video_id
1666
1667 # Attempt to extract SWF player URL
1668 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1669 if mobj is not None:
1670 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1671 else:
1672 player_url = None
1673
1674 dash_mpds = []
1675
1676 def add_dash_mpd(video_info):
1677 dash_mpd = video_info.get('dashmpd')
1678 if dash_mpd and dash_mpd[0] not in dash_mpds:
1679 dash_mpds.append(dash_mpd[0])
1680
1681 def add_dash_mpd_pr(pl_response):
1682 dash_mpd = url_or_none(try_get(
1683 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1684 compat_str))
1685 if dash_mpd and dash_mpd not in dash_mpds:
1686 dash_mpds.append(dash_mpd)
1687
1688 is_live = None
1689 view_count = None
1690
1691 def extract_view_count(v_info):
1692 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1693
1694 def extract_player_response(player_response, video_id):
1695 pl_response = str_or_none(player_response)
1696 if not pl_response:
1697 return
1698 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1699 if isinstance(pl_response, dict):
1700 add_dash_mpd_pr(pl_response)
1701 return pl_response
1702
1703 def extract_embedded_config(embed_webpage, video_id):
1704 embedded_config = self._search_regex(
1705 r'setConfig\(({.*})\);',
1706 embed_webpage, 'ytInitialData', default=None)
1707 if embedded_config:
1708 return embedded_config
1709
1710 video_info = {}
1711 player_response = {}
1712 ytplayer_config = None
1713 embed_webpage = None
1714
1715 # Get video info
1716 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1717 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1718 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1719 age_gate = True
1720 # We simulate the access to the video from www.youtube.com/v/{video_id}
1721 # this can be viewed without login into Youtube
1722 url = proto + '://www.youtube.com/embed/%s' % video_id
1723 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1724 ext = extract_embedded_config(embed_webpage, video_id)
1725 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1726 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1727 if not playable_in_embed:
1728 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1729 playable_in_embed = ''
1730 else:
1731 playable_in_embed = playable_in_embed.group('playableinEmbed')
1732 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1733 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1734 if playable_in_embed == 'false':
1735 '''
1736 # TODO apply this patch when Support for Python 2.6(!) and above drops
1737 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1738 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1739 '''
1740 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1741 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1742 age_gate = False
1743 # Try looking directly into the video webpage
1744 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1745 if ytplayer_config:
1746 args = ytplayer_config.get("args")
1747 if args is not None:
1748 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1749 # Convert to the same format returned by compat_parse_qs
1750 video_info = dict((k, [v]) for k, v in args.items())
1751 add_dash_mpd(video_info)
1752 # Rental video is not rented but preview is available (e.g.
1753 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1754 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1755 if not video_info and args.get('ypc_vid'):
1756 return self.url_result(
1757 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1758 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1759 is_live = True
1760 if not player_response:
1761 player_response = extract_player_response(args.get('player_response'), video_id)
1762 elif not player_response:
1763 player_response = ytplayer_config
1764 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1765 add_dash_mpd_pr(player_response)
1766 else:
1767 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1768 else:
1769 data = compat_urllib_parse_urlencode({
1770 'video_id': video_id,
1771 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1772 'sts': self._search_regex(
1773 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1774 })
1775 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1776 try:
1777 video_info_webpage = self._download_webpage(
1778 video_info_url, video_id,
1779 note='Refetching age-gated info webpage',
1780 errnote='unable to download video info webpage')
1781 except ExtractorError:
1782 video_info_webpage = None
1783 if video_info_webpage:
1784 video_info = compat_parse_qs(video_info_webpage)
1785 pl_response = video_info.get('player_response', [None])[0]
1786 player_response = extract_player_response(pl_response, video_id)
1787 add_dash_mpd(video_info)
1788 view_count = extract_view_count(video_info)
1789 else:
1790 age_gate = False
1791 # Try looking directly into the video webpage
1792 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1793 if ytplayer_config:
1794 args = ytplayer_config.get('args', {})
1795 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1796 # Convert to the same format returned by compat_parse_qs
1797 video_info = dict((k, [v]) for k, v in args.items())
1798 add_dash_mpd(video_info)
1799 # Rental video is not rented but preview is available (e.g.
1800 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1801 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1802 if not video_info and args.get('ypc_vid'):
1803 return self.url_result(
1804 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1805 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1806 is_live = True
1807 if not player_response:
1808 player_response = extract_player_response(args.get('player_response'), video_id)
1809 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1810 add_dash_mpd_pr(player_response)
1811
1812 if not video_info and not player_response:
1813 player_response = extract_player_response(
1814 self._search_regex(
1815 (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
1816 self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
1817 'initial player response', default='{}'),
1818 video_id)
1819
1820 def extract_unavailable_message():
1821 messages = []
1822 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1823 msg = self._html_search_regex(
1824 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1825 video_webpage, 'unavailable %s' % kind, default=None)
1826 if msg:
1827 messages.append(msg)
1828 if messages:
1829 return '\n'.join(messages)
1830
1831 if not video_info and not player_response:
1832 unavailable_message = extract_unavailable_message()
1833 if not unavailable_message:
1834 unavailable_message = 'Unable to extract video data'
1835 raise ExtractorError(
1836 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1837
1838 if not isinstance(video_info, dict):
1839 video_info = {}
1840
1841 playable_in_embed = try_get(
1842 player_response, lambda x: x['playabilityStatus']['playableInEmbed'])
1843
1844 video_details = try_get(
1845 player_response, lambda x: x['videoDetails'], dict) or {}
1846
1847 microformat = try_get(
1848 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1849
1850 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1851 if not video_title:
1852 self._downloader.report_warning('Unable to extract video title')
1853 video_title = '_'
1854
1855 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1856 if video_description:
1857
1858 def replace_url(m):
1859 redir_url = compat_urlparse.urljoin(url, m.group(1))
1860 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1861 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1862 qs = compat_parse_qs(parsed_redir_url.query)
1863 q = qs.get('q')
1864 if q and q[0]:
1865 return q[0]
1866 return redir_url
1867
1868 description_original = video_description = re.sub(r'''(?x)
1869 <a\s+
1870 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1871 (?:title|href)="([^"]+)"\s+
1872 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1873 class="[^"]*"[^>]*>
1874 [^<]+\.{3}\s*
1875 </a>
1876 ''', replace_url, video_description)
1877 video_description = clean_html(video_description)
1878 else:
1879 video_description = video_details.get('shortDescription')
1880 if video_description is None:
1881 video_description = self._html_search_meta('description', video_webpage)
1882
1883 if not smuggled_data.get('force_singlefeed', False):
1884 if not self._downloader.params.get('noplaylist'):
1885 multifeed_metadata_list = try_get(
1886 player_response,
1887 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1888 compat_str) or try_get(
1889 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1890 if multifeed_metadata_list:
1891 entries = []
1892 feed_ids = []
1893 for feed in multifeed_metadata_list.split(','):
1894 # Unquote should take place before split on comma (,) since textual
1895 # fields may contain comma as well (see
1896 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1897 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1898
                        def feed_entry(name):
                            # First value of `name` in this feed's parsed query
                            # data, as a string (None when missing).
                            return try_get(feed_data, lambda x: x[name][0], compat_str)
1901
1902 feed_id = feed_entry('id')
1903 if not feed_id:
1904 continue
1905 feed_title = feed_entry('title')
1906 title = video_title
1907 if feed_title:
1908 title += ' (%s)' % feed_title
1909 entries.append({
1910 '_type': 'url_transparent',
1911 'ie_key': 'Youtube',
1912 'url': smuggle_url(
1913 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1914 {'force_singlefeed': True}),
1915 'title': title,
1916 })
1917 feed_ids.append(feed_id)
1918 self.to_screen(
1919 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1920 % (', '.join(feed_ids), video_id))
1921 return self.playlist_result(entries, video_id, video_title, video_description)
1922 else:
1923 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1924
1925 if view_count is None:
1926 view_count = extract_view_count(video_info)
1927 if view_count is None and video_details:
1928 view_count = int_or_none(video_details.get('viewCount'))
1929 if view_count is None and microformat:
1930 view_count = int_or_none(microformat.get('viewCount'))
1931
1932 if is_live is None:
1933 is_live = bool_or_none(video_details.get('isLive'))
1934
1935 has_live_chat_replay = False
1936 if not is_live:
1937 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
1938 try:
1939 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1940 has_live_chat_replay = True
1941 except (KeyError, IndexError, TypeError):
1942 pass
1943
1944 # Check for "rental" videos
1945 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1946 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1947
        def _extract_filesize(media_url):
            # Media URLs carry the content length as a `clen` parameter,
            # either query-style (clen=123) or path-style (/clen/123).
            return int_or_none(self._search_regex(
                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1951
1952 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1953 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1954
1955 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1956 self.report_rtmp_download()
1957 formats = [{
1958 'format_id': '_rtmp',
1959 'protocol': 'rtmp',
1960 'url': video_info['conn'][0],
1961 'player_url': player_url,
1962 }]
1963 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1964 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1965 if 'rtmpe%3Dyes' in encoded_url_map:
1966 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1967 formats = []
1968 formats_spec = {}
1969 fmt_list = video_info.get('fmt_list', [''])[0]
1970 if fmt_list:
1971 for fmt in fmt_list.split(','):
1972 spec = fmt.split('/')
1973 if len(spec) > 1:
1974 width_height = spec[1].split('x')
1975 if len(width_height) == 2:
1976 formats_spec[spec[0]] = {
1977 'resolution': spec[1],
1978 'width': int_or_none(width_height[0]),
1979 'height': int_or_none(width_height[1]),
1980 }
1981 for fmt in streaming_formats:
1982 itag = str_or_none(fmt.get('itag'))
1983 if not itag:
1984 continue
1985 quality = fmt.get('quality')
1986 quality_label = fmt.get('qualityLabel') or quality
1987 formats_spec[itag] = {
1988 'asr': int_or_none(fmt.get('audioSampleRate')),
1989 'filesize': int_or_none(fmt.get('contentLength')),
1990 'format_note': quality_label,
1991 'fps': int_or_none(fmt.get('fps')),
1992 'height': int_or_none(fmt.get('height')),
1993 # bitrate for itag 43 is always 2147483647
1994 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1995 'width': int_or_none(fmt.get('width')),
1996 }
1997
1998 for fmt in streaming_formats:
1999 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2000 continue
2001 url = url_or_none(fmt.get('url'))
2002
2003 if not url:
2004 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2005 if not cipher:
2006 continue
2007 url_data = compat_parse_qs(cipher)
2008 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2009 if not url:
2010 continue
2011 else:
2012 cipher = None
2013 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2014
2015 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2016 # Unsupported FORMAT_STREAM_TYPE_OTF
2017 if stream_type == 3:
2018 continue
2019
2020 format_id = fmt.get('itag') or url_data['itag'][0]
2021 if not format_id:
2022 continue
2023 format_id = compat_str(format_id)
2024
2025 if cipher:
2026 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2027 ASSETS_RE = (
2028 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2029 r'"jsUrl"\s*:\s*("[^"]+")',
2030 r'"assets":.+?"js":\s*("[^"]+")')
2031 jsplayer_url_json = self._search_regex(
2032 ASSETS_RE,
2033 embed_webpage if age_gate else video_webpage,
2034 'JS player URL (1)', default=None)
2035 if not jsplayer_url_json and not age_gate:
2036 # We need the embed website after all
2037 if embed_webpage is None:
2038 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2039 embed_webpage = self._download_webpage(
2040 embed_url, video_id, 'Downloading embed webpage')
2041 jsplayer_url_json = self._search_regex(
2042 ASSETS_RE, embed_webpage, 'JS player URL')
2043
2044 player_url = json.loads(jsplayer_url_json)
2045 if player_url is None:
2046 player_url_json = self._search_regex(
2047 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2048 video_webpage, 'age gate player URL')
2049 player_url = json.loads(player_url_json)
2050
2051 if 'sig' in url_data:
2052 url += '&signature=' + url_data['sig'][0]
2053 elif 's' in url_data:
2054 encrypted_sig = url_data['s'][0]
2055
2056 if self._downloader.params.get('verbose'):
2057 if player_url is None:
2058 player_desc = 'unknown'
2059 else:
2060 player_type, player_version = self._extract_player_info(player_url)
2061 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2062 parts_sizes = self._signature_cache_id(encrypted_sig)
2063 self.to_screen('{%s} signature length %s, %s' %
2064 (format_id, parts_sizes, player_desc))
2065
2066 signature = self._decrypt_signature(
2067 encrypted_sig, video_id, player_url, age_gate)
2068 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2069 url += '&%s=%s' % (sp, signature)
2070 if 'ratebypass' not in url:
2071 url += '&ratebypass=yes'
2072
2073 dct = {
2074 'format_id': format_id,
2075 'url': url,
2076 'player_url': player_url,
2077 }
2078 if format_id in self._formats:
2079 dct.update(self._formats[format_id])
2080 if format_id in formats_spec:
2081 dct.update(formats_spec[format_id])
2082
2083 # Some itags are not included in DASH manifest thus corresponding formats will
2084 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2085 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2086 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2087 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2088
2089 if width is None:
2090 width = int_or_none(fmt.get('width'))
2091 if height is None:
2092 height = int_or_none(fmt.get('height'))
2093
2094 filesize = int_or_none(url_data.get(
2095 'clen', [None])[0]) or _extract_filesize(url)
2096
2097 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2098 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2099
2100 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2101 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2102 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2103
2104 more_fields = {
2105 'filesize': filesize,
2106 'tbr': tbr,
2107 'width': width,
2108 'height': height,
2109 'fps': fps,
2110 'format_note': quality_label or quality,
2111 }
2112 for key, value in more_fields.items():
2113 if value:
2114 dct[key] = value
2115 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2116 if type_:
2117 type_split = type_.split(';')
2118 kind_ext = type_split[0].split('/')
2119 if len(kind_ext) == 2:
2120 kind, _ = kind_ext
2121 dct['ext'] = mimetype2ext(type_split[0])
2122 if kind in ('audio', 'video'):
2123 codecs = None
2124 for mobj in re.finditer(
2125 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2126 if mobj.group('key') == 'codecs':
2127 codecs = mobj.group('val')
2128 break
2129 if codecs:
2130 dct.update(parse_codecs(codecs))
2131 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2132 dct['downloader_options'] = {
2133 # Youtube throttles chunks >~10M
2134 'http_chunk_size': 10485760,
2135 }
2136 formats.append(dct)
2137 else:
2138 manifest_url = (
2139 url_or_none(try_get(
2140 player_response,
2141 lambda x: x['streamingData']['hlsManifestUrl'],
2142 compat_str))
2143 or url_or_none(try_get(
2144 video_info, lambda x: x['hlsvp'][0], compat_str)))
2145 if manifest_url:
2146 formats = []
2147 m3u8_formats = self._extract_m3u8_formats(
2148 manifest_url, video_id, 'mp4', fatal=False)
2149 for a_format in m3u8_formats:
2150 itag = self._search_regex(
2151 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2152 if itag:
2153 a_format['format_id'] = itag
2154 if itag in self._formats:
2155 dct = self._formats[itag].copy()
2156 dct.update(a_format)
2157 a_format = dct
2158 a_format['player_url'] = player_url
2159 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2160 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2161 if self._downloader.params.get('youtube_include_hls_manifest', True):
2162 formats.append(a_format)
2163 else:
2164 error_message = extract_unavailable_message()
2165 if not error_message:
2166 reason_list = try_get(
2167 player_response,
2168 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2169 list) or []
2170 for reason in reason_list:
2171 if not isinstance(reason, dict):
2172 continue
2173 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2174 if reason_text:
2175 if not error_message:
2176 error_message = ''
2177 error_message += reason_text
2178 if error_message:
2179 error_message = clean_html(error_message)
2180 if not error_message:
2181 error_message = clean_html(try_get(
2182 player_response, lambda x: x['playabilityStatus']['reason'],
2183 compat_str))
2184 if not error_message:
2185 error_message = clean_html(
2186 try_get(video_info, lambda x: x['reason'][0], compat_str))
2187 if error_message:
2188 raise ExtractorError(error_message, expected=True)
2189 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2190
2191 # uploader
2192 video_uploader = try_get(
2193 video_info, lambda x: x['author'][0],
2194 compat_str) or str_or_none(video_details.get('author'))
2195 if video_uploader:
2196 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2197 else:
2198 self._downloader.report_warning('unable to extract uploader name')
2199
2200 # uploader_id
2201 video_uploader_id = None
2202 video_uploader_url = None
2203 mobj = re.search(
2204 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2205 video_webpage)
2206 if mobj is not None:
2207 video_uploader_id = mobj.group('uploader_id')
2208 video_uploader_url = mobj.group('uploader_url')
2209 else:
2210 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2211 if owner_profile_url:
2212 video_uploader_id = self._search_regex(
2213 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2214 default=None)
2215 video_uploader_url = owner_profile_url
2216
2217 channel_id = (
2218 str_or_none(video_details.get('channelId'))
2219 or self._html_search_meta(
2220 'channelId', video_webpage, 'channel id', default=None)
2221 or self._search_regex(
2222 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2223 video_webpage, 'channel id', default=None, group='id'))
2224 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2225
2226 thumbnails = []
2227 thumbnails_list = try_get(
2228 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2229 for t in thumbnails_list:
2230 if not isinstance(t, dict):
2231 continue
2232 thumbnail_url = url_or_none(t.get('url'))
2233 if not thumbnail_url:
2234 continue
2235 thumbnails.append({
2236 'url': thumbnail_url,
2237 'width': int_or_none(t.get('width')),
2238 'height': int_or_none(t.get('height')),
2239 })
2240
2241 if not thumbnails:
2242 video_thumbnail = None
2243 # We try first to get a high quality image:
2244 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2245 video_webpage, re.DOTALL)
2246 if m_thumb is not None:
2247 video_thumbnail = m_thumb.group(1)
2248 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2249 if thumbnail_url:
2250 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2251 if video_thumbnail:
2252 thumbnails.append({'url': video_thumbnail})
2253
2254 # upload date
2255 upload_date = self._html_search_meta(
2256 'datePublished', video_webpage, 'upload date', default=None)
2257 if not upload_date:
2258 upload_date = self._search_regex(
2259 [r'(?s)id="eow-date.*?>(.*?)</span>',
2260 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2261 video_webpage, 'upload date', default=None)
2262 if not upload_date:
2263 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2264 upload_date = unified_strdate(upload_date)
2265
2266 video_license = self._html_search_regex(
2267 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2268 video_webpage, 'license', default=None)
2269
2270 m_music = re.search(
2271 r'''(?x)
2272 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2273 <ul[^>]*>\s*
2274 <li>(?P<title>.+?)
2275 by (?P<creator>.+?)
2276 (?:
2277 \(.+?\)|
2278 <a[^>]*
2279 (?:
2280 \bhref=["\']/red[^>]*>| # drop possible
2281 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2282 )
2283 .*?
2284 )?</li
2285 ''',
2286 video_webpage)
2287 if m_music:
2288 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2289 video_creator = clean_html(m_music.group('creator'))
2290 else:
2291 video_alt_title = video_creator = None
2292
2293 def extract_meta(field):
2294 return self._html_search_regex(
2295 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2296 video_webpage, field, default=None)
2297
2298 track = extract_meta('Song')
2299 artist = extract_meta('Artist')
2300 album = extract_meta('Album')
2301
2302 # Youtube Music Auto-generated description
2303 release_date = release_year = None
2304 if video_description:
2305 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2306 if mobj:
2307 if not track:
2308 track = mobj.group('track').strip()
2309 if not artist:
2310 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2311 if not album:
2312 album = mobj.group('album'.strip())
2313 release_year = mobj.group('release_year')
2314 release_date = mobj.group('release_date')
2315 if release_date:
2316 release_date = release_date.replace('-', '')
2317 if not release_year:
2318 release_year = int(release_date[:4])
2319 if release_year:
2320 release_year = int(release_year)
2321
2322 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2323 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2324 for content in contents:
2325 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2326 multiple_songs = False
2327 for row in rows:
2328 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2329 multiple_songs = True
2330 break
2331 for row in rows:
2332 mrr = row.get('metadataRowRenderer') or {}
2333 mrr_title = try_get(
2334 mrr, lambda x: x['title']['simpleText'], compat_str)
2335 mrr_contents = try_get(
2336 mrr, lambda x: x['contents'][0], dict) or {}
2337 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2338 if not (mrr_title and mrr_contents_text):
2339 continue
2340 if mrr_title == 'License':
2341 video_license = mrr_contents_text
2342 elif not multiple_songs:
2343 if mrr_title == 'Album':
2344 album = mrr_contents_text
2345 elif mrr_title == 'Artist':
2346 artist = mrr_contents_text
2347 elif mrr_title == 'Song':
2348 track = mrr_contents_text
2349
2350 m_episode = re.search(
2351 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2352 video_webpage)
2353 if m_episode:
2354 series = unescapeHTML(m_episode.group('series'))
2355 season_number = int(m_episode.group('season'))
2356 episode_number = int(m_episode.group('episode'))
2357 else:
2358 series = season_number = episode_number = None
2359
2360 m_cat_container = self._search_regex(
2361 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2362 video_webpage, 'categories', default=None)
2363 category = None
2364 if m_cat_container:
2365 category = self._html_search_regex(
2366 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2367 default=None)
2368 if not category:
2369 category = try_get(
2370 microformat, lambda x: x['category'], compat_str)
2371 video_categories = None if category is None else [category]
2372
2373 video_tags = [
2374 unescapeHTML(m.group('content'))
2375 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2376 if not video_tags:
2377 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2378
2379 def _extract_count(count_name):
2380 return str_to_int(self._search_regex(
2381 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2382 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2383 video_webpage, count_name, default=None))
2384
2385 like_count = _extract_count('like')
2386 dislike_count = _extract_count('dislike')
2387
2388 if view_count is None:
2389 view_count = str_to_int(self._search_regex(
2390 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2391 'view count', default=None))
2392
2393 average_rating = (
2394 float_or_none(video_details.get('averageRating'))
2395 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2396
2397 # subtitles
2398 video_subtitles = self.extract_subtitles(
2399 video_id, video_webpage, has_live_chat_replay)
2400 automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
2401
2402 video_duration = try_get(
2403 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2404 if not video_duration:
2405 video_duration = int_or_none(video_details.get('lengthSeconds'))
2406 if not video_duration:
2407 video_duration = parse_duration(self._html_search_meta(
2408 'duration', video_webpage, 'video duration'))
2409
2410 # Get Subscriber Count of channel
2411 subscriber_count = parse_count(self._search_regex(
2412 r'"text":"([\d\.]+\w?) subscribers"',
2413 video_webpage,
2414 'subscriber count',
2415 default=None
2416 ))
2417
2418 # get xsrf for annotations or comments
2419 get_annotations = self._downloader.params.get('writeannotations', False)
2420 get_comments = self._downloader.params.get('getcomments', False)
2421 if get_annotations or get_comments:
2422 xsrf_token = None
2423 ytcfg = self._extract_ytcfg(video_id, video_webpage)
2424 if ytcfg:
2425 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2426 if not xsrf_token:
2427 xsrf_token = self._search_regex(
2428 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2429 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2430
2431 # annotations
2432 video_annotations = None
2433 if get_annotations:
2434 invideo_url = try_get(
2435 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2436 if xsrf_token and invideo_url:
2437 xsrf_field_name = None
2438 if ytcfg:
2439 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2440 if not xsrf_field_name:
2441 xsrf_field_name = self._search_regex(
2442 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2443 video_webpage, 'xsrf field name',
2444 group='xsrf_field_name', default='session_token')
2445 video_annotations = self._download_webpage(
2446 self._proto_relative_url(invideo_url),
2447 video_id, note='Downloading annotations',
2448 errnote='Unable to download video annotations', fatal=False,
2449 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2450
2451 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2452
2453 # Get comments
2454 # TODO: Refactor and move to seperate function
2455 if get_comments:
2456 expected_video_comment_count = 0
2457 video_comments = []
2458
2459 def find_value(html, key, num_chars=2, separator='"'):
2460 pos_begin = html.find(key) + len(key) + num_chars
2461 pos_end = html.find(separator, pos_begin)
2462 return html[pos_begin: pos_end]
2463
2464 def search_dict(partial, key):
2465 if isinstance(partial, dict):
2466 for k, v in partial.items():
2467 if k == key:
2468 yield v
2469 else:
2470 for o in search_dict(v, key):
2471 yield o
2472 elif isinstance(partial, list):
2473 for i in partial:
2474 for o in search_dict(i, key):
2475 yield o
2476
2477 try:
2478 ncd = next(search_dict(yt_initial_data, 'nextContinuationData'))
2479 continuations = [ncd['continuation']]
2480 # Handle videos where comments have been disabled entirely
2481 except StopIteration:
2482 continuations = []
2483
2484 def get_continuation(continuation, session_token, replies=False):
2485 query = {
2486 'pbj': 1,
2487 'ctoken': continuation,
2488 }
2489 if replies:
2490 query['action_get_comment_replies'] = 1
2491 else:
2492 query['action_get_comments'] = 1
2493
2494 while True:
2495 content, handle = self._download_webpage_handle(
2496 'https://www.youtube.com/comment_service_ajax',
2497 video_id,
2498 note=False,
2499 expected_status=[413],
2500 data=urlencode_postdata({
2501 'session_token': session_token
2502 }),
2503 query=query,
2504 headers={
2505 'Accept': '*/*',
2506 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
2507 'X-YouTube-Client-Name': '1',
2508 'X-YouTube-Client-Version': '2.20201202.06.01'
2509 }
2510 )
2511
2512 response_code = handle.getcode()
2513 if (response_code == 200):
2514 return self._parse_json(content, video_id)
2515 if (response_code == 413):
2516 return None
2517 raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
2518
2519 first_continuation = True
2520 while continuations:
2521 continuation, itct = continuations.pop()
2522 comment_response = get_continuation(continuation, xsrf_token)
2523 if not comment_response:
2524 continue
2525 if list(search_dict(comment_response, 'externalErrorMessage')):
2526 raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
2527
2528 if 'continuationContents' not in comment_response['response']:
2529 # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
2530 continue
2531 # not sure if this actually helps
2532 if 'xsrf_token' in comment_response:
2533 xsrf_token = comment_response['xsrf_token']
2534
2535 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
2536 if first_continuation:
2537 expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
2538 first_continuation = False
2539 if 'contents' not in item_section:
2540 # continuation returned no comments?
2541 # set an empty array as to not break the for loop
2542 item_section['contents'] = []
2543
2544 for meta_comment in item_section['contents']:
2545 comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
2546 video_comments.append({
2547 'id': comment['commentId'],
2548 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
2549 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
2550 'author': comment.get('authorText', {}).get('simpleText', ''),
2551 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
2552 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
2553 'parent': 'root'
2554 })
2555 if 'replies' not in meta_comment['commentThreadRenderer']:
2556 continue
2557
2558 reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
2559 while reply_continuations:
2560 time.sleep(1)
2561 continuation = reply_continuations.pop()
2562 replies_data = get_continuation(continuation, xsrf_token, True)
2563 if not replies_data or 'continuationContents' not in replies_data[1]['response']:
2564 continue
2565
2566 if self._downloader.params.get('verbose', False):
2567 self.to_screen('[debug] Comments downloaded (chain %s) %s of ~%s' % (comment['commentId'], len(video_comments), expected_video_comment_count))
2568 reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
2569 for reply_meta in replies_data[1]['response']['continuationContents']['commentRepliesContinuation']['contents']:
2570 reply_comment = reply_meta['commentRenderer']
2571 video_comments.append({
2572 'id': reply_comment['commentId'],
2573 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
2574 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
2575 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
2576 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
2577 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
2578 'parent': comment['commentId']
2579 })
2580 if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
2581 continue
2582
2583 reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
2584
2585 self.to_screen('Comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2586
2587 if 'continuations' in item_section:
2588 continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
2589 time.sleep(1)
2590
2591 self.to_screen('Total comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
2592 else:
2593 expected_video_comment_count = None
2594 video_comments = None
2595
2596 # Look for the DASH manifest
2597 if self._downloader.params.get('youtube_include_dash_manifest', True):
2598 dash_mpd_fatal = True
2599 for mpd_url in dash_mpds:
2600 dash_formats = {}
2601 try:
2602 def decrypt_sig(mobj):
2603 s = mobj.group(1)
2604 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2605 return '/signature/%s' % dec_s
2606
2607 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2608
2609 for df in self._extract_mpd_formats(
2610 mpd_url, video_id, fatal=dash_mpd_fatal,
2611 formats_dict=self._formats):
2612 if not df.get('filesize'):
2613 df['filesize'] = _extract_filesize(df['url'])
2614 # Do not overwrite DASH format found in some previous DASH manifest
2615 if df['format_id'] not in dash_formats:
2616 dash_formats[df['format_id']] = df
2617 # Additional DASH manifests may end up in HTTP Error 403 therefore
2618 # allow them to fail without bug report message if we already have
2619 # some DASH manifest succeeded. This is temporary workaround to reduce
2620 # burst of bug reports until we figure out the reason and whether it
2621 # can be fixed at all.
2622 dash_mpd_fatal = False
2623 except (ExtractorError, KeyError) as e:
2624 self.report_warning(
2625 'Skipping DASH manifest: %r' % e, video_id)
2626 if dash_formats:
2627 # Remove the formats we found through non-DASH, they
2628 # contain less info and it can be wrong, because we use
2629 # fixed values (for example the resolution). See
2630 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2631 # example.
2632 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2633 formats.extend(dash_formats.values())
2634
2635 # Check for malformed aspect ratio
2636 stretched_m = re.search(
2637 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2638 video_webpage)
2639 if stretched_m:
2640 w = float(stretched_m.group('w'))
2641 h = float(stretched_m.group('h'))
2642 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2643 # We will only process correct ratios.
2644 if w > 0 and h > 0:
2645 ratio = w / h
2646 for f in formats:
2647 if f.get('vcodec') != 'none':
2648 f['stretched_ratio'] = ratio
2649
2650 if not formats:
2651 if 'reason' in video_info:
2652 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2653 regions_allowed = self._html_search_meta(
2654 'regionsAllowed', video_webpage, default=None)
2655 countries = regions_allowed.split(',') if regions_allowed else None
2656 self.raise_geo_restricted(
2657 msg=video_info['reason'][0], countries=countries)
2658 reason = video_info['reason'][0]
2659 if 'Invalid parameters' in reason:
2660 unavailable_message = extract_unavailable_message()
2661 if unavailable_message:
2662 reason = unavailable_message
2663 raise ExtractorError(
2664 'YouTube said: %s' % reason,
2665 expected=True, video_id=video_id)
2666 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2667 raise ExtractorError('This video is DRM protected.', expected=True)
2668
2669 self._sort_formats(formats)
2670
2671 self.mark_watched(video_id, video_info, player_response)
2672
2673 return {
2674 'id': video_id,
2675 'uploader': video_uploader,
2676 'uploader_id': video_uploader_id,
2677 'uploader_url': video_uploader_url,
2678 'channel_id': channel_id,
2679 'channel_url': channel_url,
2680 'upload_date': upload_date,
2681 'license': video_license,
2682 'creator': video_creator or artist,
2683 'title': video_title,
2684 'alt_title': video_alt_title or track,
2685 'thumbnails': thumbnails,
2686 'description': video_description,
2687 'categories': video_categories,
2688 'tags': video_tags,
2689 'subtitles': video_subtitles,
2690 'automatic_captions': automatic_captions,
2691 'duration': video_duration,
2692 'age_limit': 18 if age_gate else 0,
2693 'annotations': video_annotations,
2694 'chapters': chapters,
2695 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2696 'view_count': view_count,
2697 'like_count': like_count,
2698 'dislike_count': dislike_count,
2699 'average_rating': average_rating,
2700 'formats': formats,
2701 'is_live': is_live,
2702 'start_time': start_time,
2703 'end_time': end_time,
2704 'series': series,
2705 'season_number': season_number,
2706 'episode_number': episode_number,
2707 'track': track,
2708 'artist': artist,
2709 'album': album,
2710 'release_date': release_date,
2711 'release_year': release_year,
2712 'subscriber_count': subscriber_count,
2713 'playable_in_embed': playable_in_embed,
2714 'comments': video_comments,
2715 'comment_count': expected_video_comment_count,
2716 }
2717
2718
2719class YoutubeTabIE(YoutubeBaseInfoExtractor):
2720 IE_DESC = 'YouTube.com tab'
2721 _VALID_URL = r'''(?x)
2722 https?://
2723 (?:\w+\.)?
2724 (?:
2725 youtube(?:kids)?\.com|
2726 invidio\.us
2727 )/
2728 (?:
2729 (?:channel|c|user)/|
2730 (?P<not_channel>
2731 feed/|
2732 (?:playlist|watch)\?.*?\blist=
2733 )|
2734 (?!(?:%s)\b) # Direct URLs
2735 )
2736 (?P<id>[^/?\#&]+)
2737 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
2738 IE_NAME = 'youtube:tab'
2739
2740 _TESTS = [{
2741 # playlists, multipage
2742 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2743 'playlist_mincount': 94,
2744 'info_dict': {
2745 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2746 'title': 'Игорь Клейнер - Playlists',
2747 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2748 },
2749 }, {
2750 # playlists, multipage, different order
2751 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2752 'playlist_mincount': 94,
2753 'info_dict': {
2754 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2755 'title': 'Игорь Клейнер - Playlists',
2756 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2757 },
2758 }, {
2759 # playlists, singlepage
2760 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2761 'playlist_mincount': 4,
2762 'info_dict': {
2763 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2764 'title': 'ThirstForScience - Playlists',
2765 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2766 }
2767 }, {
2768 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2769 'only_matching': True,
2770 }, {
2771 # basic, single video playlist
2772 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2773 'info_dict': {
2774 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2775 'uploader': 'Sergey M.',
2776 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2777 'title': 'youtube-dl public playlist',
2778 },
2779 'playlist_count': 1,
2780 }, {
2781 # empty playlist
2782 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2783 'info_dict': {
2784 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2785 'uploader': 'Sergey M.',
2786 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2787 'title': 'youtube-dl empty playlist',
2788 },
2789 'playlist_count': 0,
2790 }, {
2791 # Home tab
2792 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2793 'info_dict': {
2794 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2795 'title': 'lex will - Home',
2796 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2797 },
2798 'playlist_mincount': 2,
2799 }, {
2800 # Videos tab
2801 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2802 'info_dict': {
2803 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2804 'title': 'lex will - Videos',
2805 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2806 },
2807 'playlist_mincount': 975,
2808 }, {
2809 # Videos tab, sorted by popular
2810 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2811 'info_dict': {
2812 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2813 'title': 'lex will - Videos',
2814 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2815 },
2816 'playlist_mincount': 199,
2817 }, {
2818 # Playlists tab
2819 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2820 'info_dict': {
2821 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2822 'title': 'lex will - Playlists',
2823 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2824 },
2825 'playlist_mincount': 17,
2826 }, {
2827 # Community tab
2828 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2829 'info_dict': {
2830 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2831 'title': 'lex will - Community',
2832 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2833 },
2834 'playlist_mincount': 18,
2835 }, {
2836 # Channels tab
2837 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2838 'info_dict': {
2839 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2840 'title': 'lex will - Channels',
2841 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2842 },
2843 'playlist_mincount': 138,
2844 }, {
2845 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2846 'only_matching': True,
2847 }, {
2848 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2849 'only_matching': True,
2850 }, {
2851 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2852 'only_matching': True,
2853 }, {
2854 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2855 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2856 'info_dict': {
2857 'title': '29C3: Not my department',
2858 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2859 'uploader': 'Christiaan008',
2860 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2861 },
2862 'playlist_count': 96,
2863 }, {
2864 'note': 'Large playlist',
2865 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2866 'info_dict': {
2867 'title': 'Uploads from Cauchemar',
2868 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2869 'uploader': 'Cauchemar',
2870 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2871 },
2872 'playlist_mincount': 1123,
2873 }, {
2874 # even larger playlist, 8832 videos
2875 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2876 'only_matching': True,
2877 }, {
2878 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2879 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2880 'info_dict': {
2881 'title': 'Uploads from Interstellar Movie',
2882 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2883 'uploader': 'Interstellar Movie',
2884 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2885 },
2886 'playlist_mincount': 21,
2887 }, {
2888 # https://github.com/ytdl-org/youtube-dl/issues/21844
2889 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2890 'info_dict': {
2891 'title': 'Data Analysis with Dr Mike Pound',
2892 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2893 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2894 'uploader': 'Computerphile',
2895 },
2896 'playlist_mincount': 11,
2897 }, {
2898 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2899 'only_matching': True,
2900 }, {
2901 # Playlist URL that does not actually serve a playlist
2902 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2903 'info_dict': {
2904 'id': 'FqZTN594JQw',
2905 'ext': 'webm',
2906 'title': "Smiley's People 01 detective, Adventure Series, Action",
2907 'uploader': 'STREEM',
2908 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2910 'upload_date': '20150526',
2911 'license': 'Standard YouTube License',
2912 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2913 'categories': ['People & Blogs'],
2914 'tags': list,
2915 'view_count': int,
2916 'like_count': int,
2917 'dislike_count': int,
2918 },
2919 'params': {
2920 'skip_download': True,
2921 },
2922 'skip': 'This video is not available.',
2923 'add_ie': [YoutubeIE.ie_key()],
2924 }, {
2925 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2926 'only_matching': True,
2927 }, {
2928 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2929 'only_matching': True,
2930 }, {
2931 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2932 'info_dict': {
2933 'id': '9Auq9mYxFEE',
2934 'ext': 'mp4',
2935 'title': 'Watch Sky News live',
2936 'uploader': 'Sky News',
2937 'uploader_id': 'skynews',
2938 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2939 'upload_date': '20191102',
2940 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2941 'categories': ['News & Politics'],
2942 'tags': list,
2943 'like_count': int,
2944 'dislike_count': int,
2945 },
2946 'params': {
2947 'skip_download': True,
2948 },
2949 }, {
2950 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2951 'info_dict': {
2952 'id': 'a48o2S1cPoo',
2953 'ext': 'mp4',
2954 'title': 'The Young Turks - Live Main Show',
2955 'uploader': 'The Young Turks',
2956 'uploader_id': 'TheYoungTurks',
2957 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2958 'upload_date': '20150715',
2959 'license': 'Standard YouTube License',
2960 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2961 'categories': ['News & Politics'],
2962 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2963 'like_count': int,
2964 'dislike_count': int,
2965 },
2966 'params': {
2967 'skip_download': True,
2968 },
2969 'only_matching': True,
2970 }, {
2971 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2972 'only_matching': True,
2973 }, {
2974 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2975 'only_matching': True,
2976 }, {
2977 'url': 'https://www.youtube.com/feed/trending',
2978 'only_matching': True,
2979 }, {
2980 # needs auth
2981 'url': 'https://www.youtube.com/feed/library',
2982 'only_matching': True,
2983 }, {
2984 # needs auth
2985 'url': 'https://www.youtube.com/feed/history',
2986 'only_matching': True,
2987 }, {
2988 # needs auth
2989 'url': 'https://www.youtube.com/feed/subscriptions',
2990 'only_matching': True,
2991 }, {
2992 # needs auth
2993 'url': 'https://www.youtube.com/feed/watch_later',
2994 'only_matching': True,
2995 }, {
2996 # no longer available?
2997 'url': 'https://www.youtube.com/feed/recommended',
2998 'only_matching': True,
2999 }, {
3000 # inline playlist with not always working continuations
3001 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
3002 'only_matching': True,
3003 }, {
3004 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
3005 'only_matching': True,
3006 }, {
3007 'url': 'https://www.youtube.com/course',
3008 'only_matching': True,
3009 }, {
3010 'url': 'https://www.youtube.com/zsecurity',
3011 'only_matching': True,
3012 }, {
3013 'url': 'http://www.youtube.com/NASAgovVideo/videos',
3014 'only_matching': True,
3015 }, {
3016 'url': 'https://www.youtube.com/TheYoungTurks/live',
3017 'only_matching': True,
3018 }]
3019
3020 @classmethod
3021 def suitable(cls, url):
3022 return False if YoutubeIE.suitable(url) else super(
3023 YoutubeTabIE, cls).suitable(url)
3024
3025 def _extract_channel_id(self, webpage):
3026 channel_id = self._html_search_meta(
3027 'channelId', webpage, 'channel id', default=None)
3028 if channel_id:
3029 return channel_id
3030 channel_url = self._html_search_meta(
3031 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
3032 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
3033 'twitter:app:url:googleplay'), webpage, 'channel url')
3034 return self._search_regex(
3035 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
3036 channel_url, 'channel id')
3037
3038 @staticmethod
3039 def _extract_grid_item_renderer(item):
3040 for item_kind in ('Playlist', 'Video', 'Channel'):
3041 renderer = item.get('grid%sRenderer' % item_kind)
3042 if renderer:
3043 return renderer
3044
3045 def _grid_entries(self, grid_renderer):
3046 for item in grid_renderer['items']:
3047 if not isinstance(item, dict):
3048 continue
3049 renderer = self._extract_grid_item_renderer(item)
3050 if not isinstance(renderer, dict):
3051 continue
3052 title = try_get(
3053 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3054 # playlist
3055 playlist_id = renderer.get('playlistId')
3056 if playlist_id:
3057 yield self.url_result(
3058 'https://www.youtube.com/playlist?list=%s' % playlist_id,
3059 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3060 video_title=title)
3061 # video
3062 video_id = renderer.get('videoId')
3063 if video_id:
3064 yield self._extract_video(renderer)
3065 # channel
3066 channel_id = renderer.get('channelId')
3067 if channel_id:
3068 title = try_get(
3069 renderer, lambda x: x['title']['simpleText'], compat_str)
3070 yield self.url_result(
3071 'https://www.youtube.com/channel/%s' % channel_id,
3072 ie=YoutubeTabIE.ie_key(), video_title=title)
3073
3074 def _shelf_entries_from_content(self, shelf_renderer):
3075 content = shelf_renderer.get('content')
3076 if not isinstance(content, dict):
3077 return
3078 renderer = content.get('gridRenderer')
3079 if renderer:
3080 # TODO: add support for nested playlists so each shelf is processed
3081 # as separate playlist
3082 # TODO: this includes only first N items
3083 for entry in self._grid_entries(renderer):
3084 yield entry
3085 renderer = content.get('horizontalListRenderer')
3086 if renderer:
3087 # TODO
3088 pass
3089
3090 def _shelf_entries(self, shelf_renderer, skip_channels=False):
3091 ep = try_get(
3092 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3093 compat_str)
3094 shelf_url = urljoin('https://www.youtube.com', ep)
3095 if shelf_url:
3096 # Skipping links to another channels, note that checking for
3097 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
3098 # will not work
3099 if skip_channels and '/channels?' in shelf_url:
3100 return
3101 title = try_get(
3102 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
3103 yield self.url_result(shelf_url, video_title=title)
3104 # Shelf may not contain shelf URL, fallback to extraction from content
3105 for entry in self._shelf_entries_from_content(shelf_renderer):
3106 yield entry
3107
3108 def _playlist_entries(self, video_list_renderer):
3109 for content in video_list_renderer['contents']:
3110 if not isinstance(content, dict):
3111 continue
3112 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
3113 if not isinstance(renderer, dict):
3114 continue
3115 video_id = renderer.get('videoId')
3116 if not video_id:
3117 continue
3118 yield self._extract_video(renderer)
3119
3120 r""" # Not needed in the new implementation
3121 def _itemSection_entries(self, item_sect_renderer):
3122 for content in item_sect_renderer['contents']:
3123 if not isinstance(content, dict):
3124 continue
3125 renderer = content.get('videoRenderer', {})
3126 if not isinstance(renderer, dict):
3127 continue
3128 video_id = renderer.get('videoId')
3129 if not video_id:
3130 continue
3131 yield self._extract_video(renderer)
3132 """
3133
3134 def _rich_entries(self, rich_grid_renderer):
3135 renderer = try_get(
3136 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
3137 video_id = renderer.get('videoId')
3138 if not video_id:
3139 return
3140 yield self._extract_video(renderer)
3141
3142 def _video_entry(self, video_renderer):
3143 video_id = video_renderer.get('videoId')
3144 if video_id:
3145 return self._extract_video(video_renderer)
3146
3147 def _post_thread_entries(self, post_thread_renderer):
3148 post_renderer = try_get(
3149 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3150 if not post_renderer:
3151 return
3152 # video attachment
3153 video_renderer = try_get(
3154 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
3155 video_id = None
3156 if video_renderer:
3157 entry = self._video_entry(video_renderer)
3158 if entry:
3159 yield entry
3160 # inline video links
3161 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3162 for run in runs:
3163 if not isinstance(run, dict):
3164 continue
3165 ep_url = try_get(
3166 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3167 if not ep_url:
3168 continue
3169 if not YoutubeIE.suitable(ep_url):
3170 continue
3171 ep_video_id = YoutubeIE._match_id(ep_url)
3172 if video_id == ep_video_id:
3173 continue
3174 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
3175
3176 def _post_thread_continuation_entries(self, post_thread_continuation):
3177 contents = post_thread_continuation.get('contents')
3178 if not isinstance(contents, list):
3179 return
3180 for content in contents:
3181 renderer = content.get('backstagePostThreadRenderer')
3182 if not isinstance(renderer, dict):
3183 continue
3184 for entry in self._post_thread_entries(renderer):
3185 yield entry
3186
3187 @staticmethod
3188 def _build_continuation_query(continuation, ctp=None):
3189 query = {
3190 'ctoken': continuation,
3191 'continuation': continuation,
3192 }
3193 if ctp:
3194 query['itct'] = ctp
3195 return query
3196
3197 @staticmethod
3198 def _extract_next_continuation_data(renderer):
3199 next_continuation = try_get(
3200 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
3201 if not next_continuation:
3202 return
3203 continuation = next_continuation.get('continuation')
3204 if not continuation:
3205 return
3206 ctp = next_continuation.get('clickTrackingParams')
3207 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3208
3209 @classmethod
3210 def _extract_continuation(cls, renderer):
3211 next_continuation = cls._extract_next_continuation_data(renderer)
3212 if next_continuation:
3213 return next_continuation
3214 contents = renderer.get('contents')
3215 if not isinstance(contents, list):
3216 return
3217 for content in contents:
3218 if not isinstance(content, dict):
3219 continue
3220 continuation_ep = try_get(
3221 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
3222 dict)
3223 if not continuation_ep:
3224 continue
3225 continuation = try_get(
3226 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
3227 if not continuation:
3228 continue
3229 ctp = continuation_ep.get('clickTrackingParams')
3230 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3231
    def _entries(self, tab, identity_token):
        """Yield all video/playlist entries of the selected tab, transparently
        following continuation pages via the browse_ajax endpoint.

        tab: the selected tabRenderer dict.
        identity_token: optional token sent as x-youtube-identity-token so
            authenticated feeds (history, subscriptions, ...) work.
        """

        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            # Yields entries for every known renderer type found in
            # parent_renderer['contents'] and records any continuation token
            # it encounters in continuation_list[0] (closure side channel).
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # rich grids (e.g. channel Videos tab) have no itemSectionRenderer
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        # on the Channels tab, shelves legitimately link to
                        # other channels, so they must not be skipped there
                        is_channels_tab = tab.get('title') == 'Channels'
                        for entry in self._shelf_entries(renderer, not is_channels_tab):
                            yield entry
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)

            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # legacy continuation format: response.continuationContents
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
                if continuation_renderer:
                    # reset the side channel before re-running the full
                    # section extractor on the continuation page
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # newer continuation format: appendContinuationItemsAction
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
            break
3375
3376 @staticmethod
3377 def _extract_selected_tab(tabs):
3378 for tab in tabs:
3379 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3380 return tab['tabRenderer']
3381 else:
3382 raise ExtractorError('Unable to find selected tab')
3383
3384 @staticmethod
3385 def _extract_uploader(data):
3386 uploader = {}
3387 sidebar_renderer = try_get(
3388 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3389 if sidebar_renderer:
3390 for item in sidebar_renderer:
3391 if not isinstance(item, dict):
3392 continue
3393 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3394 if not isinstance(renderer, dict):
3395 continue
3396 owner = try_get(
3397 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3398 if owner:
3399 uploader['uploader'] = owner.get('text')
3400 uploader['uploader_id'] = try_get(
3401 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3402 uploader['uploader_url'] = urljoin(
3403 'https://www.youtube.com/',
3404 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3405 return uploader
3406
3407 def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
3408 selected_tab = self._extract_selected_tab(tabs)
3409 renderer = try_get(
3410 data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
3411 playlist_id = title = description = None
3412 if renderer:
3413 channel_title = renderer.get('title') or item_id
3414 tab_title = selected_tab.get('title')
3415 title = channel_title or item_id
3416 if tab_title:
3417 title += ' - %s' % tab_title
3418 description = renderer.get('description')
3419 playlist_id = renderer.get('externalId')
3420
3421 # this has thumbnails, but there is currently no thumbnail field for playlists
3422 # sidebar.playlistSidebarRenderer has even more data, but its stucture is more complec
3423 renderer = try_get(
3424 data, lambda x: x['microformat']['microformatDataRenderer'], dict)
3425 if not renderer:
3426 renderer = try_get(
3427 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
3428 if renderer:
3429 title = renderer.get('title')
3430 description = renderer.get('description')
3431 playlist_id = item_id
3432
3433 if playlist_id is None:
3434 playlist_id = item_id
3435 if title is None:
3436 title = "Youtube " + playlist_id.title()
3437 playlist = self.playlist_result(
3438 self._entries(selected_tab, identity_token),
3439 playlist_id=playlist_id, playlist_title=title,
3440 playlist_description=description)
3441 playlist.update(self._extract_uploader(data))
3442 return playlist
3443
3444 def _extract_from_playlist(self, item_id, url, data, playlist):
3445 title = playlist.get('title') or try_get(
3446 data, lambda x: x['titleText']['simpleText'], compat_str)
3447 playlist_id = playlist.get('playlistId') or item_id
3448 # Inline playlist rendition continuation does not always work
3449 # at Youtube side, so delegating regular tab-based playlist URL
3450 # processing whenever possible.
3451 playlist_url = urljoin(url, try_get(
3452 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3453 compat_str))
3454 if playlist_url and playlist_url != url:
3455 return self.url_result(
3456 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3457 video_title=title)
3458 return self.playlist_result(
3459 self._playlist_entries(playlist), playlist_id=playlist_id,
3460 playlist_title=title)
3461
3462 @staticmethod
3463 def _extract_alerts(data):
3464 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
3465 if not isinstance(alert_dict, dict):
3466 continue
3467 for renderer in alert_dict:
3468 alert = alert_dict[renderer]
3469 alert_type = alert.get('type')
3470 if not alert_type:
3471 continue
3472 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
3473 if message:
3474 yield alert_type, message
3475 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
3476 message = try_get(run, lambda x: x['text'], compat_str)
3477 if message:
3478 yield alert_type, message
3479
3480 def _extract_identity_token(self, webpage, item_id):
3481 ytcfg = self._extract_ytcfg(item_id, webpage)
3482 if ytcfg:
3483 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
3484 if token:
3485 return token
3486 return self._search_regex(
3487 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
3488 'identity token', default=None)
3489
    def _real_extract(self, url):
        """Dispatch a youtube.com URL to tab, playlist or single-video handling."""
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # A bare channel/user URL (nothing after the channel path) implies
        # the full "Videos" tab rather than just the home page.
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        # A watch URL without a usable video id can still be salvaged if it
        # names a playlist.
        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
            if playlist_id:
                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
            else:
                raise ExtractorError('Unable to recognize tab page')
        if video_id and playlist_id:
            # --no-playlist short-circuits to the single video
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        # Surface YouTube alerts: warn for non-errors, remember the last
        # error message and raise it only after reporting all alerts.
        err_msg = None
        for alert_type, alert_message in self._extract_alerts(data):
            if alert_type.lower() == 'error':
                if err_msg:
                    self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
                err_msg = alert_message
            else:
                self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        if err_msg:
            raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
        # Tabbed (channel/playlist) page?
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        # Inline playlist on a watch page?
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3549
3550
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Accepts bare playlist IDs as well as youtube/youtubekids/invidious
    # URLs carrying a 'list' query parameter.
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to YoutubeTabIE whenever it also matches the URL.
        return False if YoutubeTabIE.suitable(url) else super(
            YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Normalize to a canonical /playlist URL and delegate to YoutubeTabIE."""
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not qs:
            # Bare playlist IDs carry no query string of their own.
            qs = {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3625
3626
class YoutubeYtBeIE(InfoExtractor):
    IE_DESC = 'youtu.be'
    # Short youtu.be links that also carry a playlist ('list=') parameter.
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite the short link into a full watch URL and delegate to YoutubeTabIE."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        playlist_id = mobj.group('playlist_id')
        return self.url_result(
            update_url_query('https://www.youtube.com/watch', {
                'v': video_id,
                'list': playlist_id,
                'feature': 'youtu.be',
            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3665
3666
class YoutubeYtUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
    # 'ytuser:NAME' shorthand, resolved to the corresponding /user/ page.
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE for the user's channel page."""
        user_id = self._match_id(url)
        return self.url_result(
            'https://www.youtube.com/user/%s' % user_id,
            ie=YoutubeTabIE.ie_key(), video_id=user_id)
3680
3681
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    # Matches :ytfav, :ytfavs, :ytfavorite(s), :ytfavourite(s)
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE for the 'LL' (liked videos) playlist."""
        return self.url_result(
            'https://www.youtube.com/playlist?list=LL',
            ie=YoutubeTabIE.ie_key())
3699
3700
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None  # subclasses may set extra search filter params
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n video results for query via the innertube search API."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and continuation pages nest the section list
            # differently; try both shapes.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            continuation_token = None
            for slr_content in slr_contents:
                isr_contents = try_get(
                    slr_content,
                    lambda x: x['itemSectionRenderer']['contents'],
                    list)
                if not isr_contents:
                    continue
                for content in isr_contents:
                    if not isinstance(content, dict):
                        continue
                    video = content.get('videoRenderer')
                    if not isinstance(video, dict):
                        continue
                    video_id = video.get('videoId')
                    if not video_id:
                        continue

                    yield self._extract_video(video)
                    total += 1
                    if total == n:
                        return

                # Keep the first continuation token found; later sections
                # may repeat or lack it.
                if continuation_token is None:
                    continuation_token = try_get(
                        slr_content,
                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                        compat_str)

            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3781
3782
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
    # URL-encoded innertube search filter selecting upload-date ordering
    _SEARCH_PARAMS = 'CAI%3D'
3788
3789
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        return cls._VALID_URL

    def _real_extract(self, url):
        # Pull the query text and the optional search filters ('sp')
        # straight from the results-page URL, then reuse the regular
        # search machinery.
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        search_terms = (params.get('search_query') or params.get('q'))[0]
        self._SEARCH_PARAMS = params.get('sp', ('',))[0]
        return self._get_n_results(search_terms, self._MAX_RESULTS)
3815
3816
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True  # feeds are per-account
    # _MAX_PAGES = 5
    _TESTS = []

    @property
    def IE_NAME(self):
        # e.g. 'youtube:history' for _FEED_NAME = 'history'
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE for the feed's canonical URL."""
        return self.url_result(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            ie=YoutubeTabIE.ie_key())
3837
3838
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to YoutubeTabIE for the 'WL' (watch later) playlist."""
        return self.url_result(
            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
3851
3852
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches the bare youtube.com homepage.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
3867
3868
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    # Matches :ytsub, :ytsubs, :ytsubscription(s)
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
3880
3881
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
3890
3891
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs that lost their 'v=' parameter (usually an unquoted
    '&' in the shell) and raise a helpful error instead of failing cryptically."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3939
3940
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the required 11
    characters and raise an explicit error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)
3956
3957
3958# Do Youtube show urls even exist anymore? I couldn't find any
3959r'''
3960class YoutubeShowIE(YoutubeTabIE):
3961 IE_DESC = 'YouTube.com (multi-season) shows'
3962 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3963 IE_NAME = 'youtube:show'
3964 _TESTS = [{
3965 'url': 'https://www.youtube.com/show/airdisasters',
3966 'playlist_mincount': 5,
3967 'info_dict': {
3968 'id': 'airdisasters',
3969 'title': 'Air Disasters',
3970 }
3971 }]
3972
3973 def _real_extract(self, url):
3974 playlist_id = self._match_id(url)
3975 return super(YoutubeShowIE, self)._real_extract(
3976 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3977'''