# Source: jfr.im git mirror of yt-dlp.git — youtube_dlc/extractor/youtube.py
# Commit: "Update to ytdl-2021.01.03"
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 ExtractorError,
34 float_or_none,
35 get_element_by_id,
36 int_or_none,
37 mimetype2ext,
38 parse_codecs,
39 parse_count,
40 parse_duration,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_or_none,
45 str_to_int,
46 try_get,
47 unescapeHTML,
48 unified_strdate,
49 unsmuggle_url,
50 update_url_query,
51 uppercase_escape,
52 url_or_none,
53 urlencode_postdata,
54 urljoin,
55 )
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by the username/password flow below.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Path segments that can never be a channel/user name; used by subclass
    # URL patterns to avoid matching e.g. youtube.com/feed/... as a channel.
    _RESERVED_NAMES = (
        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist IDs (typed prefixes like PL/UU/OLAK5uy_ plus the special
    # mix/watch-later/liked aliases RDMM, WL, LL, LM).
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'

    def _set_language(self):
        # Force the English interface via the PREF cookie so that pages can be
        # scraped with stable (English) strings.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap raw video IDs into url_result dicts handled by YoutubeIE.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        NOTE(review): this talks to Google's undocumented GlifWebSignIn
        endpoints; the positional-JSON request/response shapes below were
        reverse engineered and may break without notice.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Download failure is non-fatal here; treated like "cannot log in".
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow. f_req is the step-specific
            # positional-JSON payload; the rest mimics the browser form.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Strip everything before the first '[' — presumably an
                # anti-XSSI prefix on the JSON body.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account by username.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque account token required by the subsequent challenge requests.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty [0][5] entry signals a login error (e.g. bad password).
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Step 3 (optional): handle an additional challenge (2FA etc.).
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token needed to address the TFA submission URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes are sometimes entered with the SMS "G-" prefix.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-entry convention as the password step above.
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges we cannot solve programmatically — tell the user
                # to resolve them in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Step 4: visit the CheckCookie URL so the session cookies get set,
        # then verify the redirect landed on the signed-in account page.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Copy the query dict so per-call mutations never leak into the
        # caller's dict, then delegate to the base implementation.
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        # Extract the ytInitialData JSON blob embedded in a watch page.
        # The lookbehind (?<=}) anchors the non-greedy match at the object's
        # closing brace. Returns None if the blob is absent or unparsable.
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        # Runs once before extraction: pick English UI, then try to log in.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Minimal innertube client context sent with every _call_api request.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    # ytInitialData / ytInitialPlayerResponse assignments embedded in pages,
    # plus a boundary pattern used to delimit the end of the JSON object.
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id):
        # POST to the youtubei (innertube) API endpoint `ep` with the default
        # WEB-client context merged with `query`. The key appears to be the
        # public web-client API key, not a user credential.
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        # Like _get_yt_initial_data but fatal: first try the boundary-anchored
        # pattern, then fall back to the bare assignment pattern.
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        # Parse the ytcfg.set({...}) page configuration; returns {} / None
        # rather than failing when it is missing or malformed.
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)
335
336 class YoutubeIE(YoutubeBaseInfoExtractor):
337 IE_DESC = 'YouTube.com'
338 _VALID_URL = r"""(?x)^
339 (
340 (?:https?://|//) # http(s):// or protocol-independent URL
341 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
342 (?:www\.)?deturl\.com/www\.youtube\.com/|
343 (?:www\.)?pwnyoutube\.com/|
344 (?:www\.)?hooktube\.com/|
345 (?:www\.)?yourepeat\.com/|
346 tube\.majestyc\.net/|
347 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
348 (?:(?:www|dev)\.)?invidio\.us/|
349 (?:(?:www|no)\.)?invidiou\.sh/|
350 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
351 (?:www\.)?invidious\.kabi\.tk/|
352 (?:www\.)?invidious\.13ad\.de/|
353 (?:www\.)?invidious\.mastodon\.host/|
354 (?:www\.)?invidious\.zapashcanon\.fr/|
355 (?:www\.)?invidious\.kavin\.rocks/|
356 (?:www\.)?invidious\.tube/|
357 (?:www\.)?invidiou\.site/|
358 (?:www\.)?invidious\.site/|
359 (?:www\.)?invidious\.xyz/|
360 (?:www\.)?invidious\.nixnet\.xyz/|
361 (?:www\.)?invidious\.drycat\.fr/|
362 (?:www\.)?tube\.poal\.co/|
363 (?:www\.)?tube\.connect\.cafe/|
364 (?:www\.)?vid\.wxzm\.sx/|
365 (?:www\.)?vid\.mint\.lgbt/|
366 (?:www\.)?yewtu\.be/|
367 (?:www\.)?yt\.elukerio\.org/|
368 (?:www\.)?yt\.lelux\.fi/|
369 (?:www\.)?invidious\.ggc-project\.de/|
370 (?:www\.)?yt\.maisputain\.ovh/|
371 (?:www\.)?invidious\.13ad\.de/|
372 (?:www\.)?invidious\.toot\.koeln/|
373 (?:www\.)?invidious\.fdn\.fr/|
374 (?:www\.)?watch\.nettohikari\.com/|
375 (?:www\.)?kgg2m7yk5aybusll\.onion/|
376 (?:www\.)?qklhadlycap4cnod\.onion/|
377 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
378 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
379 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
380 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
381 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
382 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
383 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
384 (?:.*?\#/)? # handle anchor (#/) redirect urls
385 (?: # the various things that can precede the ID:
386 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
387 |(?: # or the v= param in all its forms
388 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
389 (?:\?|\#!?) # the params delimiter ? or # or #!
390 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
391 v=
392 )
393 ))
394 |(?:
395 youtu\.be| # just youtu.be/xxxx
396 vid\.plus| # or vid.plus/xxxx
397 zwearz\.com/watch| # or zwearz.com/watch/xxxx
398 )/
399 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
400 )
401 )? # all until now is optional -> you can pass the naked ID
402 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
403 (?!.*?\blist=
404 (?:
405 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
406 WL # WL are handled by the watch later IE
407 )
408 )
409 (?(1).+)? # if we found the ID, everything can follow
410 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
411 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
412 _PLAYER_INFO_RE = (
413 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
414 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
415 )
416 _formats = {
417 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
418 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
419 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
420 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
421 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
422 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
423 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
424 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
425 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
426 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
427 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
428 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
429 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
430 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
431 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
432 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
433 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
434 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
435
436
437 # 3D videos
438 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
439 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
440 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
441 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
442 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
443 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
444 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
445
446 # Apple HTTP Live Streaming
447 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
448 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
449 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
450 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
451 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
452 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
453 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
454 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
455
456 # DASH mp4 video
457 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
458 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
459 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
460 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
461 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
462 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
463 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
464 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
465 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
466 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
467 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
468 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
469
470 # Dash mp4 audio
471 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
472 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
473 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
474 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
475 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
476 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
477 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
478
479 # Dash webm
480 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
481 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
482 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
483 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
484 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
485 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
486 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
487 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
488 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
490 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
491 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
492 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
493 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
494 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
495 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
496 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
497 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
498 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
499 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
500 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
501 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
502
503 # Dash webm audio
504 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
505 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
506
507 # Dash webm audio with opus inside
508 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
509 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
510 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
511
512 # RTMP (unnamed)
513 '_rtmp': {'protocol': 'rtmp'},
514
515 # av01 video only formats sometimes served with "unknown" codecs
516 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
517 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
518 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
519 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
520 }
521 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
522
523 _GEO_BYPASS = False
524
525 IE_NAME = 'youtube'
526 _TESTS = [
527 {
528 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
529 'info_dict': {
530 'id': 'BaW_jenozKc',
531 'ext': 'mp4',
532 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
533 'uploader': 'Philipp Hagemeister',
534 'uploader_id': 'phihag',
535 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
536 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
537 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
538 'upload_date': '20121002',
539 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
540 'categories': ['Science & Technology'],
541 'tags': ['youtube-dl'],
542 'duration': 10,
543 'view_count': int,
544 'like_count': int,
545 'dislike_count': int,
546 'start_time': 1,
547 'end_time': 9,
548 }
549 },
550 {
551 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
552 'note': 'Embed-only video (#1746)',
553 'info_dict': {
554 'id': 'yZIXLfi8CZQ',
555 'ext': 'mp4',
556 'upload_date': '20120608',
557 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
558 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
559 'uploader': 'SET India',
560 'uploader_id': 'setindia',
561 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
562 'age_limit': 18,
563 }
564 },
565 {
566 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
567 'note': 'Use the first video ID in the URL',
568 'info_dict': {
569 'id': 'BaW_jenozKc',
570 'ext': 'mp4',
571 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
572 'uploader': 'Philipp Hagemeister',
573 'uploader_id': 'phihag',
574 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
575 'upload_date': '20121002',
576 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
577 'categories': ['Science & Technology'],
578 'tags': ['youtube-dl'],
579 'duration': 10,
580 'view_count': int,
581 'like_count': int,
582 'dislike_count': int,
583 },
584 'params': {
585 'skip_download': True,
586 },
587 },
588 {
589 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
590 'note': '256k DASH audio (format 141) via DASH manifest',
591 'info_dict': {
592 'id': 'a9LDPn-MO4I',
593 'ext': 'm4a',
594 'upload_date': '20121002',
595 'uploader_id': '8KVIDEO',
596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
597 'description': '',
598 'uploader': '8KVIDEO',
599 'title': 'UHDTV TEST 8K VIDEO.mp4'
600 },
601 'params': {
602 'youtube_include_dash_manifest': True,
603 'format': '141',
604 },
605 'skip': 'format 141 not served anymore',
606 },
607 # DASH manifest with encrypted signature
608 {
609 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
610 'info_dict': {
611 'id': 'IB3lcPjvWLA',
612 'ext': 'm4a',
613 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
614 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
615 'duration': 244,
616 'uploader': 'AfrojackVEVO',
617 'uploader_id': 'AfrojackVEVO',
618 'upload_date': '20131011',
619 },
620 'params': {
621 'youtube_include_dash_manifest': True,
622 'format': '141/bestaudio[ext=m4a]',
623 },
624 },
625 # Controversy video
626 {
627 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
628 'info_dict': {
629 'id': 'T4XJQO3qol8',
630 'ext': 'mp4',
631 'duration': 219,
632 'upload_date': '20100909',
633 'uploader': 'Amazing Atheist',
634 'uploader_id': 'TheAmazingAtheist',
635 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
636 'title': 'Burning Everyone\'s Koran',
637 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
638 }
639 },
640 # Normal age-gate video (embed allowed)
641 {
642 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
643 'info_dict': {
644 'id': 'HtVdAasjOgU',
645 'ext': 'mp4',
646 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
647 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
648 'duration': 142,
649 'uploader': 'The Witcher',
650 'uploader_id': 'WitcherGame',
651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
652 'upload_date': '20140605',
653 'age_limit': 18,
654 },
655 },
656 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
657 # YouTube Red ad is not captured for creator
658 {
659 'url': '__2ABJjxzNo',
660 'info_dict': {
661 'id': '__2ABJjxzNo',
662 'ext': 'mp4',
663 'duration': 266,
664 'upload_date': '20100430',
665 'uploader_id': 'deadmau5',
666 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
667 'creator': 'Dada Life, deadmau5',
668 'description': 'md5:12c56784b8032162bb936a5f76d55360',
669 'uploader': 'deadmau5',
670 'title': 'Deadmau5 - Some Chords (HD)',
671 'alt_title': 'This Machine Kills Some Chords',
672 },
673 'expected_warnings': [
674 'DASH manifest missing',
675 ]
676 },
677 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
678 {
679 'url': 'lqQg6PlCWgI',
680 'info_dict': {
681 'id': 'lqQg6PlCWgI',
682 'ext': 'mp4',
683 'duration': 6085,
684 'upload_date': '20150827',
685 'uploader_id': 'olympic',
686 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
687 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
688 'uploader': 'Olympic',
689 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
690 },
691 'params': {
692 'skip_download': 'requires avconv',
693 }
694 },
695 # Non-square pixels
696 {
697 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
698 'info_dict': {
699 'id': '_b-2C3KPAM0',
700 'ext': 'mp4',
701 'stretched_ratio': 16 / 9.,
702 'duration': 85,
703 'upload_date': '20110310',
704 'uploader_id': 'AllenMeow',
705 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
706 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
707 'uploader': '孫ᄋᄅ',
708 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
709 },
710 },
711 # url_encoded_fmt_stream_map is empty string
712 {
713 'url': 'qEJwOuvDf7I',
714 'info_dict': {
715 'id': 'qEJwOuvDf7I',
716 'ext': 'webm',
717 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
718 'description': '',
719 'upload_date': '20150404',
720 'uploader_id': 'spbelect',
721 'uploader': 'Наблюдатели Петербурга',
722 },
723 'params': {
724 'skip_download': 'requires avconv',
725 },
726 'skip': 'This live event has ended.',
727 },
728 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
729 {
730 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
731 'info_dict': {
732 'id': 'FIl7x6_3R5Y',
733 'ext': 'webm',
734 'title': 'md5:7b81415841e02ecd4313668cde88737a',
735 'description': 'md5:116377fd2963b81ec4ce64b542173306',
736 'duration': 220,
737 'upload_date': '20150625',
738 'uploader_id': 'dorappi2000',
739 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
740 'uploader': 'dorappi2000',
741 'formats': 'mincount:31',
742 },
743 'skip': 'not actual anymore',
744 },
745 # DASH manifest with segment_list
746 {
747 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
748 'md5': '8ce563a1d667b599d21064e982ab9e31',
749 'info_dict': {
750 'id': 'CsmdDsKjzN8',
751 'ext': 'mp4',
752 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
753 'uploader': 'Airtek',
754 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
755 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
756 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
757 },
758 'params': {
759 'youtube_include_dash_manifest': True,
760 'format': '135', # bestvideo
761 },
762 'skip': 'This live event has ended.',
763 },
764 {
765 # Multifeed videos (multiple cameras), URL is for Main Camera
766 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
767 'info_dict': {
768 'id': 'jqWvoWXjCVs',
769 'title': 'teamPGP: Rocket League Noob Stream',
770 'description': 'md5:dc7872fb300e143831327f1bae3af010',
771 },
772 'playlist': [{
773 'info_dict': {
774 'id': 'jqWvoWXjCVs',
775 'ext': 'mp4',
776 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
777 'description': 'md5:dc7872fb300e143831327f1bae3af010',
778 'duration': 7335,
779 'upload_date': '20150721',
780 'uploader': 'Beer Games Beer',
781 'uploader_id': 'beergamesbeer',
782 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
783 'license': 'Standard YouTube License',
784 },
785 }, {
786 'info_dict': {
787 'id': '6h8e8xoXJzg',
788 'ext': 'mp4',
789 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
790 'description': 'md5:dc7872fb300e143831327f1bae3af010',
791 'duration': 7337,
792 'upload_date': '20150721',
793 'uploader': 'Beer Games Beer',
794 'uploader_id': 'beergamesbeer',
795 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
796 'license': 'Standard YouTube License',
797 },
798 }, {
799 'info_dict': {
800 'id': 'PUOgX5z9xZw',
801 'ext': 'mp4',
802 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
803 'description': 'md5:dc7872fb300e143831327f1bae3af010',
804 'duration': 7337,
805 'upload_date': '20150721',
806 'uploader': 'Beer Games Beer',
807 'uploader_id': 'beergamesbeer',
808 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
809 'license': 'Standard YouTube License',
810 },
811 }, {
812 'info_dict': {
813 'id': 'teuwxikvS5k',
814 'ext': 'mp4',
815 'title': 'teamPGP: Rocket League Noob Stream (zim)',
816 'description': 'md5:dc7872fb300e143831327f1bae3af010',
817 'duration': 7334,
818 'upload_date': '20150721',
819 'uploader': 'Beer Games Beer',
820 'uploader_id': 'beergamesbeer',
821 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
822 'license': 'Standard YouTube License',
823 },
824 }],
825 'params': {
826 'skip_download': True,
827 },
828 'skip': 'This video is not available.',
829 },
830 {
831 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
832 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
833 'info_dict': {
834 'id': 'gVfLd0zydlo',
835 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
836 },
837 'playlist_count': 2,
838 'skip': 'Not multifeed anymore',
839 },
840 {
841 'url': 'https://vid.plus/FlRa-iH7PGw',
842 'only_matching': True,
843 },
844 {
845 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
846 'only_matching': True,
847 },
848 {
849 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
850 # Also tests cut-off URL expansion in video description (see
851 # https://github.com/ytdl-org/youtube-dl/issues/1892,
852 # https://github.com/ytdl-org/youtube-dl/issues/8164)
853 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
854 'info_dict': {
855 'id': 'lsguqyKfVQg',
856 'ext': 'mp4',
857 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
858 'alt_title': 'Dark Walk - Position Music',
859 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
860 'duration': 133,
861 'upload_date': '20151119',
862 'uploader_id': 'IronSoulElf',
863 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
864 'uploader': 'IronSoulElf',
865 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
866 'track': 'Dark Walk - Position Music',
867 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
868 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
869 },
870 'params': {
871 'skip_download': True,
872 },
873 },
874 {
875 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
876 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
877 'only_matching': True,
878 },
879 {
880 # Video with yt:stretch=17:0
881 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
882 'info_dict': {
883 'id': 'Q39EVAstoRM',
884 'ext': 'mp4',
885 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
886 'description': 'md5:ee18a25c350637c8faff806845bddee9',
887 'upload_date': '20151107',
888 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
889 'uploader': 'CH GAMER DROID',
890 },
891 'params': {
892 'skip_download': True,
893 },
894 'skip': 'This video does not exist.',
895 },
896 {
897 # Video licensed under Creative Commons
898 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
899 'info_dict': {
900 'id': 'M4gD1WSo5mA',
901 'ext': 'mp4',
902 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
903 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
904 'duration': 721,
905 'upload_date': '20150127',
906 'uploader_id': 'BerkmanCenter',
907 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
908 'uploader': 'The Berkman Klein Center for Internet & Society',
909 'license': 'Creative Commons Attribution license (reuse allowed)',
910 },
911 'params': {
912 'skip_download': True,
913 },
914 },
915 {
916 # Channel-like uploader_url
917 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
918 'info_dict': {
919 'id': 'eQcmzGIKrzg',
920 'ext': 'mp4',
921 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
922 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
923 'duration': 4060,
924 'upload_date': '20151119',
925 'uploader': 'Bernie Sanders',
926 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
927 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
928 'license': 'Creative Commons Attribution license (reuse allowed)',
929 },
930 'params': {
931 'skip_download': True,
932 },
933 },
934 {
935 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
936 'only_matching': True,
937 },
938 {
939 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
940 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
941 'only_matching': True,
942 },
943 {
944 # Rental video preview
945 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
946 'info_dict': {
947 'id': 'uGpuVWrhIzE',
948 'ext': 'mp4',
949 'title': 'Piku - Trailer',
950 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
951 'upload_date': '20150811',
952 'uploader': 'FlixMatrix',
953 'uploader_id': 'FlixMatrixKaravan',
954 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
955 'license': 'Standard YouTube License',
956 },
957 'params': {
958 'skip_download': True,
959 },
960 'skip': 'This video is not available.',
961 },
962 {
963 # YouTube Red video with episode data
964 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
965 'info_dict': {
966 'id': 'iqKdEhx-dD4',
967 'ext': 'mp4',
968 'title': 'Isolation - Mind Field (Ep 1)',
969 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
970 'duration': 2085,
971 'upload_date': '20170118',
972 'uploader': 'Vsauce',
973 'uploader_id': 'Vsauce',
974 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
975 'series': 'Mind Field',
976 'season_number': 1,
977 'episode_number': 1,
978 },
979 'params': {
980 'skip_download': True,
981 },
982 'expected_warnings': [
983 'Skipping DASH manifest',
984 ],
985 },
986 {
987 # The following content has been identified by the YouTube community
988 # as inappropriate or offensive to some audiences.
989 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
990 'info_dict': {
991 'id': '6SJNVb0GnPI',
992 'ext': 'mp4',
993 'title': 'Race Differences in Intelligence',
994 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
995 'duration': 965,
996 'upload_date': '20140124',
997 'uploader': 'New Century Foundation',
998 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
999 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1000 },
1001 'params': {
1002 'skip_download': True,
1003 },
1004 },
1005 {
1006 # itag 212
1007 'url': '1t24XAntNCY',
1008 'only_matching': True,
1009 },
1010 {
1011 # geo restricted to JP
1012 'url': 'sJL6WA-aGkQ',
1013 'only_matching': True,
1014 },
1015 {
1016 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1017 'only_matching': True,
1018 },
1019 {
1020 # DRM protected
1021 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1022 'only_matching': True,
1023 },
1024 {
1025 # Video with unsupported adaptive stream type formats
1026 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1027 'info_dict': {
1028 'id': 'Z4Vy8R84T1U',
1029 'ext': 'mp4',
1030 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1031 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1032 'duration': 433,
1033 'upload_date': '20130923',
1034 'uploader': 'Amelia Putri Harwita',
1035 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1036 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1037 'formats': 'maxcount:10',
1038 },
1039 'params': {
1040 'skip_download': True,
1041 'youtube_include_dash_manifest': False,
1042 },
1043 'skip': 'not actual anymore',
1044 },
1045 {
1046 # Youtube Music Auto-generated description
1047 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1048 'info_dict': {
1049 'id': 'MgNrAu2pzNs',
1050 'ext': 'mp4',
1051 'title': 'Voyeur Girl',
1052 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1053 'upload_date': '20190312',
1054 'uploader': 'Stephen - Topic',
1055 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1056 'artist': 'Stephen',
1057 'track': 'Voyeur Girl',
1058 'album': 'it\'s too much love to know my dear',
1059 'release_date': '20190313',
1060 'release_year': 2019,
1061 },
1062 'params': {
1063 'skip_download': True,
1064 },
1065 },
1066 {
1067 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1068 'only_matching': True,
1069 },
1070 {
1071 # invalid -> valid video id redirection
1072 'url': 'DJztXj2GPfl',
1073 'info_dict': {
1074 'id': 'DJztXj2GPfk',
1075 'ext': 'mp4',
1076 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1077 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1078 'upload_date': '20090125',
1079 'uploader': 'Prochorowka',
1080 'uploader_id': 'Prochorowka',
1081 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1082 'artist': 'Panjabi MC',
1083 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1084 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1085 },
1086 'params': {
1087 'skip_download': True,
1088 },
1089 },
1090 {
1091 # empty description results in an empty string
1092 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1093 'info_dict': {
1094 'id': 'x41yOUIvK2k',
1095 'ext': 'mp4',
1096 'title': 'IMG 3456',
1097 'description': '',
1098 'upload_date': '20170613',
1099 'uploader_id': 'ElevageOrVert',
1100 'uploader': 'ElevageOrVert',
1101 },
1102 'params': {
1103 'skip_download': True,
1104 },
1105 },
1106 {
1107 # with '};' inside yt initial data (see [1])
1108 # see [2] for an example with '};' inside ytInitialPlayerResponse
1109 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1110 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1111 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1112 'info_dict': {
1113 'id': 'CHqg6qOn4no',
1114 'ext': 'mp4',
1115 'title': 'Part 77 Sort a list of simple types in c#',
1116 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1117 'upload_date': '20130831',
1118 'uploader_id': 'kudvenkat',
1119 'uploader': 'kudvenkat',
1120 },
1121 'params': {
1122 'skip_download': True,
1123 },
1124 },
1125 {
1126 # another example of '};' in ytInitialData
1127 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1128 'only_matching': True,
1129 },
1130 {
1131 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1132 'only_matching': True,
1133 },
1134 ]
1135
1136 def __init__(self, *args, **kwargs):
1137 super(YoutubeIE, self).__init__(*args, **kwargs)
1138 self._player_cache = {}
1139
1140 def report_video_info_webpage_download(self, video_id):
1141 """Report attempt to download video info webpage."""
1142 self.to_screen('%s: Downloading video info webpage' % video_id)
1143
1144 def report_information_extraction(self, video_id):
1145 """Report attempt to extract video information."""
1146 self.to_screen('%s: Extracting video information' % video_id)
1147
1148 def report_unavailable_format(self, video_id, format):
1149 """Report extracted video URL."""
1150 self.to_screen('%s: Format %s not available' % (video_id, format))
1151
1152 def report_rtmp_download(self):
1153 """Indicate the download will use the RTMP protocol."""
1154 self.to_screen('RTMP download detected')
1155
1156 def _signature_cache_id(self, example_sig):
1157 """ Return a string representation of a signature """
1158 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1159
1160 @classmethod
1161 def _extract_player_info(cls, player_url):
1162 for player_re in cls._PLAYER_INFO_RE:
1163 id_m = re.search(player_re, player_url)
1164 if id_m:
1165 break
1166 else:
1167 raise ExtractorError('Cannot identify player %r' % player_url)
1168 return id_m.group('ext'), id_m.group('id')
1169
1170 def _extract_signature_function(self, video_id, player_url, example_sig):
1171 player_type, player_id = self._extract_player_info(player_url)
1172
1173 # Read from filesystem cache
1174 func_id = '%s_%s_%s' % (
1175 player_type, player_id, self._signature_cache_id(example_sig))
1176 assert os.path.basename(func_id) == func_id
1177
1178 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1179 if cache_spec is not None:
1180 return lambda s: ''.join(s[i] for i in cache_spec)
1181
1182 download_note = (
1183 'Downloading player %s' % player_url
1184 if self._downloader.params.get('verbose') else
1185 'Downloading %s player %s' % (player_type, player_id)
1186 )
1187 if player_type == 'js':
1188 code = self._download_webpage(
1189 player_url, video_id,
1190 note=download_note,
1191 errnote='Download of %s failed' % player_url)
1192 res = self._parse_sig_js(code)
1193 elif player_type == 'swf':
1194 urlh = self._request_webpage(
1195 player_url, video_id,
1196 note=download_note,
1197 errnote='Download of %s failed' % player_url)
1198 code = urlh.read()
1199 res = self._parse_sig_swf(code)
1200 else:
1201 assert False, 'Invalid player type %r' % player_type
1202
1203 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1204 cache_res = res(test_string)
1205 cache_spec = [ord(c) for c in cache_res]
1206
1207 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1208 return res
1209
    def _print_sig_code(self, func, example_sig):
        """Print Python source code that reproduces *func* as slicing code.

        Used with the youtube_print_sig_code option so a deciphered
        permutation can be turned into a hardcoded signature routine.
        """
        def gen_sig_code(idxs):
            # Yields expressions like 's[3]' or 's[8:2:-1]', compressing runs
            # of indices with a constant step of +/-1 into slices.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, emitting a slice when a run ends.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the final open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Apply func to a probe string to recover the index permutation.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1248
    def _parse_sig_js(self, jscode):
        """Locate the signature function inside the player JS and wrap it.

        Returns a callable mapping an encrypted signature string to its
        deciphered form, executed via JSInterpreter.
        """
        # Patterns are tried in order, most specific first; the obsolete ones
        # are kept to support older player versions.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single argument.
        return lambda s: initial_function([s])
1269
1270 def _parse_sig_swf(self, file_contents):
1271 swfi = SWFInterpreter(file_contents)
1272 TARGET_CLASSNAME = 'SignatureDecipher'
1273 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1274 initial_function = swfi.extract_function(searched_class, 'decipher')
1275 return lambda s: initial_function([s])
1276
1277 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1278 """Turn the encrypted s field into a working signature"""
1279
1280 if player_url is None:
1281 raise ExtractorError('Cannot decrypt signature without player_url')
1282
1283 if player_url.startswith('//'):
1284 player_url = 'https:' + player_url
1285 elif not re.match(r'https?://', player_url):
1286 player_url = compat_urlparse.urljoin(
1287 'https://www.youtube.com', player_url)
1288 try:
1289 player_id = (player_url, self._signature_cache_id(s))
1290 if player_id not in self._player_cache:
1291 func = self._extract_signature_function(
1292 video_id, player_url, s
1293 )
1294 self._player_cache[player_id] = func
1295 func = self._player_cache[player_id]
1296 if self._downloader.params.get('youtube_print_sig_code'):
1297 self._print_sig_code(func, s)
1298 return func(s)
1299 except Exception as e:
1300 tb = traceback.format_exc()
1301 raise ExtractorError(
1302 'Signature extraction failed: ' + tb, cause=e)
1303
1304 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1305 try:
1306 subs_doc = self._download_xml(
1307 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1308 video_id, note=False)
1309 except ExtractorError as err:
1310 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1311 return {}
1312
1313 sub_lang_list = {}
1314 for track in subs_doc.findall('track'):
1315 lang = track.attrib['lang_code']
1316 if lang in sub_lang_list:
1317 continue
1318 sub_formats = []
1319 for ext in self._SUBTITLE_FORMATS:
1320 params = compat_urllib_parse_urlencode({
1321 'lang': lang,
1322 'v': video_id,
1323 'fmt': ext,
1324 'name': track.attrib['name'].encode('utf-8'),
1325 })
1326 sub_formats.append({
1327 'url': 'https://www.youtube.com/api/timedtext?' + params,
1328 'ext': ext,
1329 })
1330 sub_lang_list[lang] = sub_formats
1331 if has_live_chat_replay:
1332 sub_lang_list['live_chat'] = [
1333 {
1334 'video_id': video_id,
1335 'ext': 'json',
1336 'protocol': 'youtube_live_chat_replay',
1337 },
1338 ]
1339 if not sub_lang_list:
1340 self._downloader.report_warning('video doesn\'t have subtitles')
1341 return {}
1342 return sub_lang_list
1343
1344 def _get_ytplayer_config(self, video_id, webpage):
1345 patterns = (
1346 # User data may contain arbitrary character sequences that may affect
1347 # JSON extraction with regex, e.g. when '};' is contained the second
1348 # regex won't capture the whole JSON. Yet working around by trying more
1349 # concrete regex first keeping in mind proper quoted string handling
1350 # to be implemented in future that will replace this workaround (see
1351 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1352 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1353 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1354 r';ytplayer\.config\s*=\s*({.+?});',
1355 )
1356 config = self._search_regex(
1357 patterns, webpage, 'ytplayer.config', default=None)
1358 if config:
1359 return self._parse_json(
1360 uppercase_escape(config), video_id, fatal=False)
1361
    def _get_automatic_captions(self, video_id, player_response, player_config):
        """Return automatic (ASR) captions as {language code: [format dicts]}.

        Both player_response and player_config come from the watch page and
        either may be missing; when neither is available (or extraction fails)
        a warning is emitted and {} is returned.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not (player_response or player_config):
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config.get('args') if player_config else {}
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy flow: ttsurl + timestamp based caption listing.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the query of
                # the base caption URL (only tlang/fmt vary per entry).
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1458
1459 def _mark_watched(self, video_id, video_info, player_response):
1460 playback_url = url_or_none(try_get(
1461 player_response,
1462 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1463 video_info, lambda x: x['videostats_playback_base_url'][0]))
1464 if not playback_url:
1465 return
1466 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1467 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1468
1469 # cpn generation algorithm is reverse engineered from base.js.
1470 # In fact it works even with dummy cpn.
1471 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1472 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1473
1474 qs.update({
1475 'ver': ['2'],
1476 'cpn': [cpn],
1477 })
1478 playback_url = compat_urlparse.urlunparse(
1479 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1480
1481 self._download_webpage(
1482 playback_url, video_id, 'Marking watched',
1483 'Unable to mark watched', fatal=False)
1484
1485 @staticmethod
1486 def _extract_urls(webpage):
1487 # Embedded YouTube player
1488 entries = [
1489 unescapeHTML(mobj.group('url'))
1490 for mobj in re.finditer(r'''(?x)
1491 (?:
1492 <iframe[^>]+?src=|
1493 data-video-url=|
1494 <embed[^>]+?src=|
1495 embedSWF\(?:\s*|
1496 <object[^>]+data=|
1497 new\s+SWFObject\(
1498 )
1499 (["\'])
1500 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1501 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1502 \1''', webpage)]
1503
1504 # lazyYT YouTube embed
1505 entries.extend(list(map(
1506 unescapeHTML,
1507 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1508
1509 # Wordpress "YouTube Video Importer" plugin
1510 matches = re.findall(r'''(?x)<div[^>]+
1511 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1512 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1513 entries.extend(m[-1] for m in matches)
1514
1515 return entries
1516
1517 @staticmethod
1518 def _extract_url(webpage):
1519 urls = YoutubeIE._extract_urls(webpage)
1520 return urls[0] if urls else None
1521
1522 @classmethod
1523 def extract_id(cls, url):
1524 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1525 if mobj is None:
1526 raise ExtractorError('Invalid URL: %s' % url)
1527 video_id = mobj.group(2)
1528 return video_id
1529
1530 def _extract_chapters_from_json(self, webpage, video_id, duration):
1531 if not webpage:
1532 return
1533 data = self._extract_yt_initial_data(video_id, webpage)
1534 if not data or not isinstance(data, dict):
1535 return
1536 chapters_list = try_get(
1537 data,
1538 lambda x: x['playerOverlays']
1539 ['playerOverlayRenderer']
1540 ['decoratedPlayerBarRenderer']
1541 ['decoratedPlayerBarRenderer']
1542 ['playerBar']
1543 ['chapteredPlayerBarRenderer']
1544 ['chapters'],
1545 list)
1546 if not chapters_list:
1547 return
1548
1549 def chapter_time(chapter):
1550 return float_or_none(
1551 try_get(
1552 chapter,
1553 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1554 int),
1555 scale=1000)
1556 chapters = []
1557 for next_num, chapter in enumerate(chapters_list, start=1):
1558 start_time = chapter_time(chapter)
1559 if start_time is None:
1560 continue
1561 end_time = (chapter_time(chapters_list[next_num])
1562 if next_num < len(chapters_list) else duration)
1563 if end_time is None:
1564 continue
1565 title = try_get(
1566 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1567 compat_str)
1568 chapters.append({
1569 'start_time': start_time,
1570 'end_time': end_time,
1571 'title': title,
1572 })
1573 return chapters
1574
1575 @staticmethod
1576 def _extract_chapters_from_description(description, duration):
1577 if not description:
1578 return None
1579 chapter_lines = re.findall(
1580 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1581 description)
1582 if not chapter_lines:
1583 return None
1584 chapters = []
1585 for next_num, (chapter_line, time_point) in enumerate(
1586 chapter_lines, start=1):
1587 start_time = parse_duration(time_point)
1588 if start_time is None:
1589 continue
1590 if start_time > duration:
1591 break
1592 end_time = (duration if next_num == len(chapter_lines)
1593 else parse_duration(chapter_lines[next_num][1]))
1594 if end_time is None:
1595 continue
1596 if end_time > duration:
1597 end_time = duration
1598 if start_time > end_time:
1599 break
1600 chapter_title = re.sub(
1601 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1602 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1603 chapters.append({
1604 'start_time': start_time,
1605 'end_time': end_time,
1606 'title': chapter_title,
1607 })
1608 return chapters
1609
1610 def _extract_chapters(self, webpage, description, video_id, duration):
1611 return (self._extract_chapters_from_json(webpage, video_id, duration)
1612 or self._extract_chapters_from_description(description, duration))
1613
1614 def _real_extract(self, url):
1615 url, smuggled_data = unsmuggle_url(url, {})
1616
1617 proto = (
1618 'http' if self._downloader.params.get('prefer_insecure', False)
1619 else 'https')
1620
1621 start_time = None
1622 end_time = None
1623 parsed_url = compat_urllib_parse_urlparse(url)
1624 for component in [parsed_url.fragment, parsed_url.query]:
1625 query = compat_parse_qs(component)
1626 if start_time is None and 't' in query:
1627 start_time = parse_duration(query['t'][0])
1628 if start_time is None and 'start' in query:
1629 start_time = parse_duration(query['start'][0])
1630 if end_time is None and 'end' in query:
1631 end_time = parse_duration(query['end'][0])
1632
1633 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1634 mobj = re.search(self._NEXT_URL_RE, url)
1635 if mobj:
1636 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1637 video_id = self.extract_id(url)
1638
1639 # Get video webpage
1640 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1641 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1642
1643 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1644 video_id = qs.get('v', [None])[0] or video_id
1645
1646 # Attempt to extract SWF player URL
1647 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1648 if mobj is not None:
1649 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1650 else:
1651 player_url = None
1652
1653 dash_mpds = []
1654
1655 def add_dash_mpd(video_info):
1656 dash_mpd = video_info.get('dashmpd')
1657 if dash_mpd and dash_mpd[0] not in dash_mpds:
1658 dash_mpds.append(dash_mpd[0])
1659
1660 def add_dash_mpd_pr(pl_response):
1661 dash_mpd = url_or_none(try_get(
1662 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1663 compat_str))
1664 if dash_mpd and dash_mpd not in dash_mpds:
1665 dash_mpds.append(dash_mpd)
1666
1667 is_live = None
1668 view_count = None
1669
1670 def extract_view_count(v_info):
1671 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1672
1673 def extract_player_response(player_response, video_id):
1674 pl_response = str_or_none(player_response)
1675 if not pl_response:
1676 return
1677 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1678 if isinstance(pl_response, dict):
1679 add_dash_mpd_pr(pl_response)
1680 return pl_response
1681
1682 def extract_embedded_config(embed_webpage, video_id):
1683 embedded_config = self._search_regex(
1684 r'setConfig\(({.*})\);',
1685 embed_webpage, 'ytInitialData', default=None)
1686 if embedded_config:
1687 return embedded_config
1688
1689 player_response = {}
1690
1691 # Get video info
1692 video_info = {}
1693 embed_webpage = None
1694 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1695 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1696 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1697 age_gate = True
1698 # We simulate the access to the video from www.youtube.com/v/{video_id}
1699 # this can be viewed without login into Youtube
1700 url = proto + '://www.youtube.com/embed/%s' % video_id
1701 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1702 ext = extract_embedded_config(embed_webpage, video_id)
1703 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1704 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1705 if not playable_in_embed:
1706 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1707 playable_in_embed = ''
1708 else:
1709 playable_in_embed = playable_in_embed.group('playableinEmbed')
1710 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1711 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1712 if playable_in_embed == 'false':
1713 '''
1714 # TODO apply this patch when Support for Python 2.6(!) and above drops
1715 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1716 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1717 '''
1718 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1719 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1720 age_gate = False
1721 # Try looking directly into the video webpage
1722 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1723 if ytplayer_config:
1724 args = ytplayer_config.get("args")
1725 if args is not None:
1726 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1727 # Convert to the same format returned by compat_parse_qs
1728 video_info = dict((k, [v]) for k, v in args.items())
1729 add_dash_mpd(video_info)
1730 # Rental video is not rented but preview is available (e.g.
1731 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1732 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1733 if not video_info and args.get('ypc_vid'):
1734 return self.url_result(
1735 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1736 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1737 is_live = True
1738 if not player_response:
1739 player_response = extract_player_response(args.get('player_response'), video_id)
1740 elif not player_response:
1741 player_response = ytplayer_config
1742 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1743 add_dash_mpd_pr(player_response)
1744 else:
1745 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1746 else:
1747 data = compat_urllib_parse_urlencode({
1748 'video_id': video_id,
1749 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1750 'sts': self._search_regex(
1751 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1752 })
1753 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1754 try:
1755 video_info_webpage = self._download_webpage(
1756 video_info_url, video_id,
1757 note='Refetching age-gated info webpage',
1758 errnote='unable to download video info webpage')
1759 except ExtractorError:
1760 video_info_webpage = None
1761 if video_info_webpage:
1762 video_info = compat_parse_qs(video_info_webpage)
1763 pl_response = video_info.get('player_response', [None])[0]
1764 player_response = extract_player_response(pl_response, video_id)
1765 add_dash_mpd(video_info)
1766 view_count = extract_view_count(video_info)
1767 else:
1768 age_gate = False
1769 # Try looking directly into the video webpage
1770 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1771 if ytplayer_config:
1772 args = ytplayer_config.get('args', {})
1773 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1774 # Convert to the same format returned by compat_parse_qs
1775 video_info = dict((k, [v]) for k, v in args.items())
1776 add_dash_mpd(video_info)
1777 # Rental video is not rented but preview is available (e.g.
1778 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1779 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1780 if not video_info and args.get('ypc_vid'):
1781 return self.url_result(
1782 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1783 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1784 is_live = True
1785 if not player_response:
1786 player_response = extract_player_response(args.get('player_response'), video_id)
1787 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1788 add_dash_mpd_pr(player_response)
1789
1790 if not video_info and not player_response:
1791 player_response = extract_player_response(
1792 self._search_regex(
1793 (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
1794 self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
1795 'initial player response', default='{}'),
1796 video_id)
1797
1798 def extract_unavailable_message():
1799 messages = []
1800 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1801 msg = self._html_search_regex(
1802 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1803 video_webpage, 'unavailable %s' % kind, default=None)
1804 if msg:
1805 messages.append(msg)
1806 if messages:
1807 return '\n'.join(messages)
1808
1809 if not video_info and not player_response:
1810 unavailable_message = extract_unavailable_message()
1811 if not unavailable_message:
1812 unavailable_message = 'Unable to extract video data'
1813 raise ExtractorError(
1814 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1815
1816 if not isinstance(video_info, dict):
1817 video_info = {}
1818
1819 video_details = try_get(
1820 player_response, lambda x: x['videoDetails'], dict) or {}
1821
1822 microformat = try_get(
1823 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1824
1825 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1826 if not video_title:
1827 self._downloader.report_warning('Unable to extract video title')
1828 video_title = '_'
1829
1830 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1831 if video_description:
1832
1833 def replace_url(m):
1834 redir_url = compat_urlparse.urljoin(url, m.group(1))
1835 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1836 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1837 qs = compat_parse_qs(parsed_redir_url.query)
1838 q = qs.get('q')
1839 if q and q[0]:
1840 return q[0]
1841 return redir_url
1842
1843 description_original = video_description = re.sub(r'''(?x)
1844 <a\s+
1845 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1846 (?:title|href)="([^"]+)"\s+
1847 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1848 class="[^"]*"[^>]*>
1849 [^<]+\.{3}\s*
1850 </a>
1851 ''', replace_url, video_description)
1852 video_description = clean_html(video_description)
1853 else:
1854 video_description = video_details.get('shortDescription')
1855 if video_description is None:
1856 video_description = self._html_search_meta('description', video_webpage)
1857
1858 if not smuggled_data.get('force_singlefeed', False):
1859 if not self._downloader.params.get('noplaylist'):
1860 multifeed_metadata_list = try_get(
1861 player_response,
1862 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1863 compat_str) or try_get(
1864 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1865 if multifeed_metadata_list:
1866 entries = []
1867 feed_ids = []
1868 for feed in multifeed_metadata_list.split(','):
1869 # Unquote should take place before split on comma (,) since textual
1870 # fields may contain comma as well (see
1871 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1872 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1873
1874 def feed_entry(name):
1875 return try_get(feed_data, lambda x: x[name][0], compat_str)
1876
1877 feed_id = feed_entry('id')
1878 if not feed_id:
1879 continue
1880 feed_title = feed_entry('title')
1881 title = video_title
1882 if feed_title:
1883 title += ' (%s)' % feed_title
1884 entries.append({
1885 '_type': 'url_transparent',
1886 'ie_key': 'Youtube',
1887 'url': smuggle_url(
1888 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1889 {'force_singlefeed': True}),
1890 'title': title,
1891 })
1892 feed_ids.append(feed_id)
1893 self.to_screen(
1894 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1895 % (', '.join(feed_ids), video_id))
1896 return self.playlist_result(entries, video_id, video_title, video_description)
1897 else:
1898 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1899
1900 if view_count is None:
1901 view_count = extract_view_count(video_info)
1902 if view_count is None and video_details:
1903 view_count = int_or_none(video_details.get('viewCount'))
1904 if view_count is None and microformat:
1905 view_count = int_or_none(microformat.get('viewCount'))
1906
1907 if is_live is None:
1908 is_live = bool_or_none(video_details.get('isLive'))
1909
1910 has_live_chat_replay = False
1911 if not is_live:
1912 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1913 try:
1914 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1915 has_live_chat_replay = True
1916 except (KeyError, IndexError, TypeError):
1917 pass
1918
1919 # Check for "rental" videos
1920 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1921 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1922
1923 def _extract_filesize(media_url):
1924 return int_or_none(self._search_regex(
1925 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1926
1927 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1928 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1929
1930 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1931 self.report_rtmp_download()
1932 formats = [{
1933 'format_id': '_rtmp',
1934 'protocol': 'rtmp',
1935 'url': video_info['conn'][0],
1936 'player_url': player_url,
1937 }]
1938 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1939 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1940 if 'rtmpe%3Dyes' in encoded_url_map:
1941 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1942 formats = []
1943 formats_spec = {}
1944 fmt_list = video_info.get('fmt_list', [''])[0]
1945 if fmt_list:
1946 for fmt in fmt_list.split(','):
1947 spec = fmt.split('/')
1948 if len(spec) > 1:
1949 width_height = spec[1].split('x')
1950 if len(width_height) == 2:
1951 formats_spec[spec[0]] = {
1952 'resolution': spec[1],
1953 'width': int_or_none(width_height[0]),
1954 'height': int_or_none(width_height[1]),
1955 }
1956 for fmt in streaming_formats:
1957 itag = str_or_none(fmt.get('itag'))
1958 if not itag:
1959 continue
1960 quality = fmt.get('quality')
1961 quality_label = fmt.get('qualityLabel') or quality
1962 formats_spec[itag] = {
1963 'asr': int_or_none(fmt.get('audioSampleRate')),
1964 'filesize': int_or_none(fmt.get('contentLength')),
1965 'format_note': quality_label,
1966 'fps': int_or_none(fmt.get('fps')),
1967 'height': int_or_none(fmt.get('height')),
1968 # bitrate for itag 43 is always 2147483647
1969 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1970 'width': int_or_none(fmt.get('width')),
1971 }
1972
1973 for fmt in streaming_formats:
1974 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1975 continue
1976 url = url_or_none(fmt.get('url'))
1977
1978 if not url:
1979 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1980 if not cipher:
1981 continue
1982 url_data = compat_parse_qs(cipher)
1983 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1984 if not url:
1985 continue
1986 else:
1987 cipher = None
1988 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1989
1990 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1991 # Unsupported FORMAT_STREAM_TYPE_OTF
1992 if stream_type == 3:
1993 continue
1994
1995 format_id = fmt.get('itag') or url_data['itag'][0]
1996 if not format_id:
1997 continue
1998 format_id = compat_str(format_id)
1999
2000 if cipher:
2001 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2002 ASSETS_RE = (
2003 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2004 r'"jsUrl"\s*:\s*("[^"]+")',
2005 r'"assets":.+?"js":\s*("[^"]+")')
2006 jsplayer_url_json = self._search_regex(
2007 ASSETS_RE,
2008 embed_webpage if age_gate else video_webpage,
2009 'JS player URL (1)', default=None)
2010 if not jsplayer_url_json and not age_gate:
2011 # We need the embed website after all
2012 if embed_webpage is None:
2013 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2014 embed_webpage = self._download_webpage(
2015 embed_url, video_id, 'Downloading embed webpage')
2016 jsplayer_url_json = self._search_regex(
2017 ASSETS_RE, embed_webpage, 'JS player URL')
2018
2019 player_url = json.loads(jsplayer_url_json)
2020 if player_url is None:
2021 player_url_json = self._search_regex(
2022 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2023 video_webpage, 'age gate player URL')
2024 player_url = json.loads(player_url_json)
2025
2026 if 'sig' in url_data:
2027 url += '&signature=' + url_data['sig'][0]
2028 elif 's' in url_data:
2029 encrypted_sig = url_data['s'][0]
2030
2031 if self._downloader.params.get('verbose'):
2032 if player_url is None:
2033 player_desc = 'unknown'
2034 else:
2035 player_type, player_version = self._extract_player_info(player_url)
2036 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2037 parts_sizes = self._signature_cache_id(encrypted_sig)
2038 self.to_screen('{%s} signature length %s, %s' %
2039 (format_id, parts_sizes, player_desc))
2040
2041 signature = self._decrypt_signature(
2042 encrypted_sig, video_id, player_url, age_gate)
2043 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2044 url += '&%s=%s' % (sp, signature)
2045 if 'ratebypass' not in url:
2046 url += '&ratebypass=yes'
2047
2048 dct = {
2049 'format_id': format_id,
2050 'url': url,
2051 'player_url': player_url,
2052 }
2053 if format_id in self._formats:
2054 dct.update(self._formats[format_id])
2055 if format_id in formats_spec:
2056 dct.update(formats_spec[format_id])
2057
2058 # Some itags are not included in DASH manifest thus corresponding formats will
2059 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2060 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2061 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2062 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2063
2064 if width is None:
2065 width = int_or_none(fmt.get('width'))
2066 if height is None:
2067 height = int_or_none(fmt.get('height'))
2068
2069 filesize = int_or_none(url_data.get(
2070 'clen', [None])[0]) or _extract_filesize(url)
2071
2072 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2073 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2074
2075 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2076 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2077 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2078
2079 more_fields = {
2080 'filesize': filesize,
2081 'tbr': tbr,
2082 'width': width,
2083 'height': height,
2084 'fps': fps,
2085 'format_note': quality_label or quality,
2086 }
2087 for key, value in more_fields.items():
2088 if value:
2089 dct[key] = value
2090 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2091 if type_:
2092 type_split = type_.split(';')
2093 kind_ext = type_split[0].split('/')
2094 if len(kind_ext) == 2:
2095 kind, _ = kind_ext
2096 dct['ext'] = mimetype2ext(type_split[0])
2097 if kind in ('audio', 'video'):
2098 codecs = None
2099 for mobj in re.finditer(
2100 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2101 if mobj.group('key') == 'codecs':
2102 codecs = mobj.group('val')
2103 break
2104 if codecs:
2105 dct.update(parse_codecs(codecs))
2106 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2107 dct['downloader_options'] = {
2108 # Youtube throttles chunks >~10M
2109 'http_chunk_size': 10485760,
2110 }
2111 formats.append(dct)
2112 else:
2113 manifest_url = (
2114 url_or_none(try_get(
2115 player_response,
2116 lambda x: x['streamingData']['hlsManifestUrl'],
2117 compat_str))
2118 or url_or_none(try_get(
2119 video_info, lambda x: x['hlsvp'][0], compat_str)))
2120 if manifest_url:
2121 formats = []
2122 m3u8_formats = self._extract_m3u8_formats(
2123 manifest_url, video_id, 'mp4', fatal=False)
2124 for a_format in m3u8_formats:
2125 itag = self._search_regex(
2126 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2127 if itag:
2128 a_format['format_id'] = itag
2129 if itag in self._formats:
2130 dct = self._formats[itag].copy()
2131 dct.update(a_format)
2132 a_format = dct
2133 a_format['player_url'] = player_url
2134 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2135 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2136 if self._downloader.params.get('youtube_include_hls_manifest', True):
2137 formats.append(a_format)
2138 else:
2139 error_message = extract_unavailable_message()
2140 if not error_message:
2141 reason_list = try_get(
2142 player_response,
2143 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2144 list) or []
2145 for reason in reason_list:
2146 if not isinstance(reason, dict):
2147 continue
2148 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2149 if reason_text:
2150 if not error_message:
2151 error_message = ''
2152 error_message += reason_text
2153 if error_message:
2154 error_message = clean_html(error_message)
2155 if not error_message:
2156 error_message = clean_html(try_get(
2157 player_response, lambda x: x['playabilityStatus']['reason'],
2158 compat_str))
2159 if not error_message:
2160 error_message = clean_html(
2161 try_get(video_info, lambda x: x['reason'][0], compat_str))
2162 if error_message:
2163 raise ExtractorError(error_message, expected=True)
2164 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2165
2166 # uploader
2167 video_uploader = try_get(
2168 video_info, lambda x: x['author'][0],
2169 compat_str) or str_or_none(video_details.get('author'))
2170 if video_uploader:
2171 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2172 else:
2173 self._downloader.report_warning('unable to extract uploader name')
2174
2175 # uploader_id
2176 video_uploader_id = None
2177 video_uploader_url = None
2178 mobj = re.search(
2179 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2180 video_webpage)
2181 if mobj is not None:
2182 video_uploader_id = mobj.group('uploader_id')
2183 video_uploader_url = mobj.group('uploader_url')
2184 else:
2185 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2186 if owner_profile_url:
2187 video_uploader_id = self._search_regex(
2188 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2189 default=None)
2190 video_uploader_url = owner_profile_url
2191
2192 channel_id = (
2193 str_or_none(video_details.get('channelId'))
2194 or self._html_search_meta(
2195 'channelId', video_webpage, 'channel id', default=None)
2196 or self._search_regex(
2197 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2198 video_webpage, 'channel id', default=None, group='id'))
2199 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2200
2201 thumbnails = []
2202 thumbnails_list = try_get(
2203 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2204 for t in thumbnails_list:
2205 if not isinstance(t, dict):
2206 continue
2207 thumbnail_url = url_or_none(t.get('url'))
2208 if not thumbnail_url:
2209 continue
2210 thumbnails.append({
2211 'url': thumbnail_url,
2212 'width': int_or_none(t.get('width')),
2213 'height': int_or_none(t.get('height')),
2214 })
2215
2216 if not thumbnails:
2217 video_thumbnail = None
2218 # We try first to get a high quality image:
2219 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2220 video_webpage, re.DOTALL)
2221 if m_thumb is not None:
2222 video_thumbnail = m_thumb.group(1)
2223 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2224 if thumbnail_url:
2225 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2226 if video_thumbnail:
2227 thumbnails.append({'url': video_thumbnail})
2228
2229 # upload date
2230 upload_date = self._html_search_meta(
2231 'datePublished', video_webpage, 'upload date', default=None)
2232 if not upload_date:
2233 upload_date = self._search_regex(
2234 [r'(?s)id="eow-date.*?>(.*?)</span>',
2235 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2236 video_webpage, 'upload date', default=None)
2237 if not upload_date:
2238 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2239 upload_date = unified_strdate(upload_date)
2240
2241 video_license = self._html_search_regex(
2242 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2243 video_webpage, 'license', default=None)
2244
2245 m_music = re.search(
2246 r'''(?x)
2247 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2248 <ul[^>]*>\s*
2249 <li>(?P<title>.+?)
2250 by (?P<creator>.+?)
2251 (?:
2252 \(.+?\)|
2253 <a[^>]*
2254 (?:
2255 \bhref=["\']/red[^>]*>| # drop possible
2256 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2257 )
2258 .*?
2259 )?</li
2260 ''',
2261 video_webpage)
2262 if m_music:
2263 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2264 video_creator = clean_html(m_music.group('creator'))
2265 else:
2266 video_alt_title = video_creator = None
2267
2268 def extract_meta(field):
2269 return self._html_search_regex(
2270 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2271 video_webpage, field, default=None)
2272
2273 track = extract_meta('Song')
2274 artist = extract_meta('Artist')
2275 album = extract_meta('Album')
2276
2277 # Youtube Music Auto-generated description
2278 release_date = release_year = None
2279 if video_description:
2280 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2281 if mobj:
2282 if not track:
2283 track = mobj.group('track').strip()
2284 if not artist:
2285 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2286 if not album:
2287 album = mobj.group('album'.strip())
2288 release_year = mobj.group('release_year')
2289 release_date = mobj.group('release_date')
2290 if release_date:
2291 release_date = release_date.replace('-', '')
2292 if not release_year:
2293 release_year = int(release_date[:4])
2294 if release_year:
2295 release_year = int(release_year)
2296
2297 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2298 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2299 for content in contents:
2300 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2301 multiple_songs = False
2302 for row in rows:
2303 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2304 multiple_songs = True
2305 break
2306 for row in rows:
2307 mrr = row.get('metadataRowRenderer') or {}
2308 mrr_title = try_get(
2309 mrr, lambda x: x['title']['simpleText'], compat_str)
2310 mrr_contents = try_get(
2311 mrr, lambda x: x['contents'][0], dict) or {}
2312 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2313 if not (mrr_title and mrr_contents_text):
2314 continue
2315 if mrr_title == 'License':
2316 video_license = mrr_contents_text
2317 elif not multiple_songs:
2318 if mrr_title == 'Album':
2319 album = mrr_contents_text
2320 elif mrr_title == 'Artist':
2321 artist = mrr_contents_text
2322 elif mrr_title == 'Song':
2323 track = mrr_contents_text
2324
2325 m_episode = re.search(
2326 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2327 video_webpage)
2328 if m_episode:
2329 series = unescapeHTML(m_episode.group('series'))
2330 season_number = int(m_episode.group('season'))
2331 episode_number = int(m_episode.group('episode'))
2332 else:
2333 series = season_number = episode_number = None
2334
2335 m_cat_container = self._search_regex(
2336 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2337 video_webpage, 'categories', default=None)
2338 category = None
2339 if m_cat_container:
2340 category = self._html_search_regex(
2341 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2342 default=None)
2343 if not category:
2344 category = try_get(
2345 microformat, lambda x: x['category'], compat_str)
2346 video_categories = None if category is None else [category]
2347
2348 video_tags = [
2349 unescapeHTML(m.group('content'))
2350 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2351 if not video_tags:
2352 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2353
2354 def _extract_count(count_name):
2355 return str_to_int(self._search_regex(
2356 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2357 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2358 video_webpage, count_name, default=None))
2359
2360 like_count = _extract_count('like')
2361 dislike_count = _extract_count('dislike')
2362
2363 if view_count is None:
2364 view_count = str_to_int(self._search_regex(
2365 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2366 'view count', default=None))
2367
2368 average_rating = (
2369 float_or_none(video_details.get('averageRating'))
2370 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2371
2372 # subtitles
2373 video_subtitles = self.extract_subtitles(
2374 video_id, video_webpage, has_live_chat_replay)
2375 automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
2376
2377 video_duration = try_get(
2378 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2379 if not video_duration:
2380 video_duration = int_or_none(video_details.get('lengthSeconds'))
2381 if not video_duration:
2382 video_duration = parse_duration(self._html_search_meta(
2383 'duration', video_webpage, 'video duration'))
2384
2385 # Get Subscriber Count of channel
2386 subscriber_count = parse_count(self._search_regex(
2387 r'"text":"([\d\.]+\w?) subscribers"',
2388 video_webpage,
2389 'subscriber count',
2390 default=None
2391 ))
2392
2393 # annotations
2394 video_annotations = None
2395 if self._downloader.params.get('writeannotations', False):
2396 xsrf_token = None
2397 ytcfg = self._extract_ytcfg(video_id, video_webpage)
2398 if ytcfg:
2399 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2400 if not xsrf_token:
2401 xsrf_token = self._search_regex(
2402 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2403 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2404 invideo_url = try_get(
2405 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2406 if xsrf_token and invideo_url:
2407 xsrf_field_name = None
2408 if ytcfg:
2409 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2410 if not xsrf_field_name:
2411 xsrf_field_name = self._search_regex(
2412 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2413 video_webpage, 'xsrf field name',
2414 group='xsrf_field_name', default='session_token')
2415 video_annotations = self._download_webpage(
2416 self._proto_relative_url(invideo_url),
2417 video_id, note='Downloading annotations',
2418 errnote='Unable to download video annotations', fatal=False,
2419 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2420
2421 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2422
2423 # Look for the DASH manifest
2424 if self._downloader.params.get('youtube_include_dash_manifest', True):
2425 dash_mpd_fatal = True
2426 for mpd_url in dash_mpds:
2427 dash_formats = {}
2428 try:
2429 def decrypt_sig(mobj):
2430 s = mobj.group(1)
2431 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2432 return '/signature/%s' % dec_s
2433
2434 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2435
2436 for df in self._extract_mpd_formats(
2437 mpd_url, video_id, fatal=dash_mpd_fatal,
2438 formats_dict=self._formats):
2439 if not df.get('filesize'):
2440 df['filesize'] = _extract_filesize(df['url'])
2441 # Do not overwrite DASH format found in some previous DASH manifest
2442 if df['format_id'] not in dash_formats:
2443 dash_formats[df['format_id']] = df
2444 # Additional DASH manifests may end up in HTTP Error 403 therefore
2445 # allow them to fail without bug report message if we already have
2446 # some DASH manifest succeeded. This is temporary workaround to reduce
2447 # burst of bug reports until we figure out the reason and whether it
2448 # can be fixed at all.
2449 dash_mpd_fatal = False
2450 except (ExtractorError, KeyError) as e:
2451 self.report_warning(
2452 'Skipping DASH manifest: %r' % e, video_id)
2453 if dash_formats:
2454 # Remove the formats we found through non-DASH, they
2455 # contain less info and it can be wrong, because we use
2456 # fixed values (for example the resolution). See
2457 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2458 # example.
2459 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2460 formats.extend(dash_formats.values())
2461
2462 # Check for malformed aspect ratio
2463 stretched_m = re.search(
2464 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2465 video_webpage)
2466 if stretched_m:
2467 w = float(stretched_m.group('w'))
2468 h = float(stretched_m.group('h'))
2469 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2470 # We will only process correct ratios.
2471 if w > 0 and h > 0:
2472 ratio = w / h
2473 for f in formats:
2474 if f.get('vcodec') != 'none':
2475 f['stretched_ratio'] = ratio
2476
2477 if not formats:
2478 if 'reason' in video_info:
2479 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2480 regions_allowed = self._html_search_meta(
2481 'regionsAllowed', video_webpage, default=None)
2482 countries = regions_allowed.split(',') if regions_allowed else None
2483 self.raise_geo_restricted(
2484 msg=video_info['reason'][0], countries=countries)
2485 reason = video_info['reason'][0]
2486 if 'Invalid parameters' in reason:
2487 unavailable_message = extract_unavailable_message()
2488 if unavailable_message:
2489 reason = unavailable_message
2490 raise ExtractorError(
2491 'YouTube said: %s' % reason,
2492 expected=True, video_id=video_id)
2493 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2494 raise ExtractorError('This video is DRM protected.', expected=True)
2495
2496 self._sort_formats(formats)
2497
2498 self.mark_watched(video_id, video_info, player_response)
2499
2500 return {
2501 'id': video_id,
2502 'uploader': video_uploader,
2503 'uploader_id': video_uploader_id,
2504 'uploader_url': video_uploader_url,
2505 'channel_id': channel_id,
2506 'channel_url': channel_url,
2507 'upload_date': upload_date,
2508 'license': video_license,
2509 'creator': video_creator or artist,
2510 'title': video_title,
2511 'alt_title': video_alt_title or track,
2512 'thumbnails': thumbnails,
2513 'description': video_description,
2514 'categories': video_categories,
2515 'tags': video_tags,
2516 'subtitles': video_subtitles,
2517 'automatic_captions': automatic_captions,
2518 'duration': video_duration,
2519 'age_limit': 18 if age_gate else 0,
2520 'annotations': video_annotations,
2521 'chapters': chapters,
2522 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2523 'view_count': view_count,
2524 'like_count': like_count,
2525 'dislike_count': dislike_count,
2526 'average_rating': average_rating,
2527 'formats': formats,
2528 'is_live': is_live,
2529 'start_time': start_time,
2530 'end_time': end_time,
2531 'series': series,
2532 'season_number': season_number,
2533 'episode_number': episode_number,
2534 'track': track,
2535 'artist': artist,
2536 'album': album,
2537 'release_date': release_date,
2538 'release_year': release_year,
2539 'subscriber_count': subscriber_count,
2540 }
2541
2542
2543 class YoutubeTabIE(YoutubeBaseInfoExtractor):
2544 IE_DESC = 'YouTube.com tab'
2545 _VALID_URL = r'''(?x)
2546 https?://
2547 (?:\w+\.)?
2548 (?:
2549 youtube(?:kids)?\.com|
2550 invidio\.us
2551 )/
2552 (?:
2553 (?:channel|c|user)/|
2554 (?P<not_channel>
2555 feed/|
2556 (?:playlist|watch)\?.*?\blist=
2557 )|
2558 (?!(?:%s)\b) # Direct URLs
2559 )
2560 (?P<id>[^/?\#&]+)
2561 ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
2562 IE_NAME = 'youtube:tab'
2563
2564 _TESTS = [{
2565 # playlists, multipage
2566 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2567 'playlist_mincount': 94,
2568 'info_dict': {
2569 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2570 'title': 'Игорь Клейнер - Playlists',
2571 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2572 },
2573 }, {
2574 # playlists, multipage, different order
2575 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2576 'playlist_mincount': 94,
2577 'info_dict': {
2578 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2579 'title': 'Игорь Клейнер - Playlists',
2580 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2581 },
2582 }, {
2583 # playlists, singlepage
2584 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2585 'playlist_mincount': 4,
2586 'info_dict': {
2587 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2588 'title': 'ThirstForScience - Playlists',
2589 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2590 }
2591 }, {
2592 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2593 'only_matching': True,
2594 }, {
2595 # basic, single video playlist
2596 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2597 'info_dict': {
2598 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2599 'uploader': 'Sergey M.',
2600 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2601 'title': 'youtube-dl public playlist',
2602 },
2603 'playlist_count': 1,
2604 }, {
2605 # empty playlist
2606 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2607 'info_dict': {
2608 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2609 'uploader': 'Sergey M.',
2610 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2611 'title': 'youtube-dl empty playlist',
2612 },
2613 'playlist_count': 0,
2614 }, {
2615 # Home tab
2616 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2617 'info_dict': {
2618 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2619 'title': 'lex will - Home',
2620 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2621 },
2622 'playlist_mincount': 2,
2623 }, {
2624 # Videos tab
2625 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2626 'info_dict': {
2627 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2628 'title': 'lex will - Videos',
2629 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2630 },
2631 'playlist_mincount': 975,
2632 }, {
2633 # Videos tab, sorted by popular
2634 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2635 'info_dict': {
2636 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2637 'title': 'lex will - Videos',
2638 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2639 },
2640 'playlist_mincount': 199,
2641 }, {
2642 # Playlists tab
2643 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2644 'info_dict': {
2645 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2646 'title': 'lex will - Playlists',
2647 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2648 },
2649 'playlist_mincount': 17,
2650 }, {
2651 # Community tab
2652 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2653 'info_dict': {
2654 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2655 'title': 'lex will - Community',
2656 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2657 },
2658 'playlist_mincount': 18,
2659 }, {
2660 # Channels tab
2661 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2662 'info_dict': {
2663 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2664 'title': 'lex will - Channels',
2665 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2666 },
2667 'playlist_mincount': 138,
2668 }, {
2669 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2670 'only_matching': True,
2671 }, {
2672 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2673 'only_matching': True,
2674 }, {
2675 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2676 'only_matching': True,
2677 }, {
2678 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2679 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2680 'info_dict': {
2681 'title': '29C3: Not my department',
2682 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2683 'uploader': 'Christiaan008',
2684 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2685 },
2686 'playlist_count': 96,
2687 }, {
2688 'note': 'Large playlist',
2689 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2690 'info_dict': {
2691 'title': 'Uploads from Cauchemar',
2692 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2693 'uploader': 'Cauchemar',
2694 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2695 },
2696 'playlist_mincount': 1123,
2697 }, {
2698 # even larger playlist, 8832 videos
2699 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2700 'only_matching': True,
2701 }, {
2702 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2703 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2704 'info_dict': {
2705 'title': 'Uploads from Interstellar Movie',
2706 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2707 'uploader': 'Interstellar Movie',
2708 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2709 },
2710 'playlist_mincount': 21,
2711 }, {
2712 # https://github.com/ytdl-org/youtube-dl/issues/21844
2713 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2714 'info_dict': {
2715 'title': 'Data Analysis with Dr Mike Pound',
2716 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2717 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2718 'uploader': 'Computerphile',
2719 },
2720 'playlist_mincount': 11,
2721 }, {
2722 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2723 'only_matching': True,
2724 }, {
2725 # Playlist URL that does not actually serve a playlist
2726 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2727 'info_dict': {
2728 'id': 'FqZTN594JQw',
2729 'ext': 'webm',
2730 'title': "Smiley's People 01 detective, Adventure Series, Action",
2731 'uploader': 'STREEM',
2732 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2734 'upload_date': '20150526',
2735 'license': 'Standard YouTube License',
2736 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2737 'categories': ['People & Blogs'],
2738 'tags': list,
2739 'view_count': int,
2740 'like_count': int,
2741 'dislike_count': int,
2742 },
2743 'params': {
2744 'skip_download': True,
2745 },
2746 'skip': 'This video is not available.',
2747 'add_ie': [YoutubeIE.ie_key()],
2748 }, {
2749 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2750 'only_matching': True,
2751 }, {
2752 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2753 'only_matching': True,
2754 }, {
2755 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2756 'info_dict': {
2757 'id': '9Auq9mYxFEE',
2758 'ext': 'mp4',
2759 'title': 'Watch Sky News live',
2760 'uploader': 'Sky News',
2761 'uploader_id': 'skynews',
2762 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2763 'upload_date': '20191102',
2764 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2765 'categories': ['News & Politics'],
2766 'tags': list,
2767 'like_count': int,
2768 'dislike_count': int,
2769 },
2770 'params': {
2771 'skip_download': True,
2772 },
2773 }, {
2774 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2775 'info_dict': {
2776 'id': 'a48o2S1cPoo',
2777 'ext': 'mp4',
2778 'title': 'The Young Turks - Live Main Show',
2779 'uploader': 'The Young Turks',
2780 'uploader_id': 'TheYoungTurks',
2781 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2782 'upload_date': '20150715',
2783 'license': 'Standard YouTube License',
2784 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2785 'categories': ['News & Politics'],
2786 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2787 'like_count': int,
2788 'dislike_count': int,
2789 },
2790 'params': {
2791 'skip_download': True,
2792 },
2793 'only_matching': True,
2794 }, {
2795 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2796 'only_matching': True,
2797 }, {
2798 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2799 'only_matching': True,
2800 }, {
2801 'url': 'https://www.youtube.com/feed/trending',
2802 'only_matching': True,
2803 }, {
2804 # needs auth
2805 'url': 'https://www.youtube.com/feed/library',
2806 'only_matching': True,
2807 }, {
2808 # needs auth
2809 'url': 'https://www.youtube.com/feed/history',
2810 'only_matching': True,
2811 }, {
2812 # needs auth
2813 'url': 'https://www.youtube.com/feed/subscriptions',
2814 'only_matching': True,
2815 }, {
2816 # needs auth
2817 'url': 'https://www.youtube.com/feed/watch_later',
2818 'only_matching': True,
2819 }, {
2820 # no longer available?
2821 'url': 'https://www.youtube.com/feed/recommended',
2822 'only_matching': True,
2823 }, {
2824 # inline playlist with not always working continuations
2825 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2826 'only_matching': True,
2827 }, {
2828 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2829 'only_matching': True,
2830 }, {
2831 'url': 'https://www.youtube.com/course',
2832 'only_matching': True,
2833 }, {
2834 'url': 'https://www.youtube.com/zsecurity',
2835 'only_matching': True,
2836 }, {
2837 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2838 'only_matching': True,
2839 }, {
2840 'url': 'https://www.youtube.com/TheYoungTurks/live',
2841 'only_matching': True,
2842 }]
2843
2844 @classmethod
2845 def suitable(cls, url):
2846 return False if YoutubeIE.suitable(url) else super(
2847 YoutubeTabIE, cls).suitable(url)
2848
2849 def _extract_channel_id(self, webpage):
2850 channel_id = self._html_search_meta(
2851 'channelId', webpage, 'channel id', default=None)
2852 if channel_id:
2853 return channel_id
2854 channel_url = self._html_search_meta(
2855 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2856 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2857 'twitter:app:url:googleplay'), webpage, 'channel url')
2858 return self._search_regex(
2859 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2860 channel_url, 'channel id')
2861
2862 @staticmethod
2863 def _extract_grid_item_renderer(item):
2864 for item_kind in ('Playlist', 'Video', 'Channel'):
2865 renderer = item.get('grid%sRenderer' % item_kind)
2866 if renderer:
2867 return renderer
2868
2869 def _extract_video(self, renderer):
2870 video_id = renderer.get('videoId')
2871 title = try_get(
2872 renderer,
2873 (lambda x: x['title']['runs'][0]['text'],
2874 lambda x: x['title']['simpleText']), compat_str)
2875 description = try_get(
2876 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2877 compat_str)
2878 duration = parse_duration(try_get(
2879 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2880 view_count_text = try_get(
2881 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2882 view_count = str_to_int(self._search_regex(
2883 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2884 'view count', default=None))
2885 uploader = try_get(
2886 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2887 return {
2888 '_type': 'url_transparent',
2889 'ie_key': YoutubeIE.ie_key(),
2890 'id': video_id,
2891 'url': video_id,
2892 'title': title,
2893 'description': description,
2894 'duration': duration,
2895 'view_count': view_count,
2896 'uploader': uploader,
2897 }
2898
2899 def _grid_entries(self, grid_renderer):
2900 for item in grid_renderer['items']:
2901 if not isinstance(item, dict):
2902 continue
2903 renderer = self._extract_grid_item_renderer(item)
2904 if not isinstance(renderer, dict):
2905 continue
2906 title = try_get(
2907 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2908 # playlist
2909 playlist_id = renderer.get('playlistId')
2910 if playlist_id:
2911 yield self.url_result(
2912 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2913 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2914 video_title=title)
2915 # video
2916 video_id = renderer.get('videoId')
2917 if video_id:
2918 yield self._extract_video(renderer)
2919 # channel
2920 channel_id = renderer.get('channelId')
2921 if channel_id:
2922 title = try_get(
2923 renderer, lambda x: x['title']['simpleText'], compat_str)
2924 yield self.url_result(
2925 'https://www.youtube.com/channel/%s' % channel_id,
2926 ie=YoutubeTabIE.ie_key(), video_title=title)
2927
2928 def _shelf_entries_from_content(self, shelf_renderer):
2929 content = shelf_renderer.get('content')
2930 if not isinstance(content, dict):
2931 return
2932 renderer = content.get('gridRenderer')
2933 if renderer:
2934 # TODO: add support for nested playlists so each shelf is processed
2935 # as separate playlist
2936 # TODO: this includes only first N items
2937 for entry in self._grid_entries(renderer):
2938 yield entry
2939 renderer = content.get('horizontalListRenderer')
2940 if renderer:
2941 # TODO
2942 pass
2943
2944 def _shelf_entries(self, shelf_renderer, skip_channels=False):
2945 ep = try_get(
2946 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2947 compat_str)
2948 shelf_url = urljoin('https://www.youtube.com', ep)
2949 if shelf_url:
2950 # Skipping links to another channels, note that checking for
2951 # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
2952 # will not work
2953 if skip_channels and '/channels?' in shelf_url:
2954 return
2955 title = try_get(
2956 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2957 yield self.url_result(shelf_url, video_title=title)
2958 # Shelf may not contain shelf URL, fallback to extraction from content
2959 for entry in self._shelf_entries_from_content(shelf_renderer):
2960 yield entry
2961
2962 def _playlist_entries(self, video_list_renderer):
2963 for content in video_list_renderer['contents']:
2964 if not isinstance(content, dict):
2965 continue
2966 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2967 if not isinstance(renderer, dict):
2968 continue
2969 video_id = renderer.get('videoId')
2970 if not video_id:
2971 continue
2972 yield self._extract_video(renderer)
2973
2974 r""" # Not needed in the new implementation
2975 def _itemSection_entries(self, item_sect_renderer):
2976 for content in item_sect_renderer['contents']:
2977 if not isinstance(content, dict):
2978 continue
2979 renderer = content.get('videoRenderer', {})
2980 if not isinstance(renderer, dict):
2981 continue
2982 video_id = renderer.get('videoId')
2983 if not video_id:
2984 continue
2985 yield self._extract_video(renderer)
2986 """
2987
2988 def _rich_entries(self, rich_grid_renderer):
2989 renderer = try_get(
2990 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
2991 video_id = renderer.get('videoId')
2992 if not video_id:
2993 return
2994 yield self._extract_video(renderer)
2995
2996 def _video_entry(self, video_renderer):
2997 video_id = video_renderer.get('videoId')
2998 if video_id:
2999 return self._extract_video(video_renderer)
3000
3001 def _post_thread_entries(self, post_thread_renderer):
3002 post_renderer = try_get(
3003 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
3004 if not post_renderer:
3005 return
3006 # video attachment
3007 video_renderer = try_get(
3008 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
3009 video_id = None
3010 if video_renderer:
3011 entry = self._video_entry(video_renderer)
3012 if entry:
3013 yield entry
3014 # inline video links
3015 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
3016 for run in runs:
3017 if not isinstance(run, dict):
3018 continue
3019 ep_url = try_get(
3020 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
3021 if not ep_url:
3022 continue
3023 if not YoutubeIE.suitable(ep_url):
3024 continue
3025 ep_video_id = YoutubeIE._match_id(ep_url)
3026 if video_id == ep_video_id:
3027 continue
3028 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
3029
3030 def _post_thread_continuation_entries(self, post_thread_continuation):
3031 contents = post_thread_continuation.get('contents')
3032 if not isinstance(contents, list):
3033 return
3034 for content in contents:
3035 renderer = content.get('backstagePostThreadRenderer')
3036 if not isinstance(renderer, dict):
3037 continue
3038 for entry in self._post_thread_entries(renderer):
3039 yield entry
3040
3041 @staticmethod
3042 def _build_continuation_query(continuation, ctp=None):
3043 query = {
3044 'ctoken': continuation,
3045 'continuation': continuation,
3046 }
3047 if ctp:
3048 query['itct'] = ctp
3049 return query
3050
3051 @staticmethod
3052 def _extract_next_continuation_data(renderer):
3053 next_continuation = try_get(
3054 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
3055 if not next_continuation:
3056 return
3057 continuation = next_continuation.get('continuation')
3058 if not continuation:
3059 return
3060 ctp = next_continuation.get('clickTrackingParams')
3061 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3062
3063 @classmethod
3064 def _extract_continuation(cls, renderer):
3065 next_continuation = cls._extract_next_continuation_data(renderer)
3066 if next_continuation:
3067 return next_continuation
3068 contents = renderer.get('contents')
3069 if not isinstance(contents, list):
3070 return
3071 for content in contents:
3072 if not isinstance(content, dict):
3073 continue
3074 continuation_ep = try_get(
3075 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
3076 dict)
3077 if not continuation_ep:
3078 continue
3079 continuation = try_get(
3080 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
3081 if not continuation:
3082 continue
3083 ctp = continuation_ep.get('clickTrackingParams')
3084 return YoutubeTabIE._build_continuation_query(continuation, ctp)
3085
    def _entries(self, tab, identity_token):
        """Yield all entries of a tab, following continuations across pages.

        `continuation_list` is a one-element list used as a mutable cell so
        the nested generator can pass the next continuation back out
        (Python 2 has no `nonlocal`).
        """

        def extract_entries(parent_renderer):  # called again for each feed continuation
            # Dispatch every content item to the matching renderer handler
            # and record the continuation of whichever renderer provides one.
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Rich grid layout (e.g. channel home): items come one by one
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        # On the Channels tab, shelf links to other channels
                        # are the payload, so do not skip them there
                        is_channels_tab = tab.get('title') == 'Channels'
                        for entry in self._shelf_entries(renderer, not is_channels_tab):
                            yield entry
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry

                    if not continuation_list[0]:
                        continuation_list[0] = self._extract_continuation(is_renderer)

                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        # Page through continuations until none is produced
        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Old-style continuation payloads
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds
                if continuation_renderer:
                    # Reset the cell so extract_entries can report the next continuation
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # New-style continuation payloads
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    # NOTE(review): `continuation` is left unchanged here, so the
                    # same page would be requested again — confirm this branch is
                    # unreachable in practice
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
            break
3229
3230 @staticmethod
3231 def _extract_selected_tab(tabs):
3232 for tab in tabs:
3233 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3234 return tab['tabRenderer']
3235 else:
3236 raise ExtractorError('Unable to find selected tab')
3237
3238 @staticmethod
3239 def _extract_uploader(data):
3240 uploader = {}
3241 sidebar_renderer = try_get(
3242 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3243 if sidebar_renderer:
3244 for item in sidebar_renderer:
3245 if not isinstance(item, dict):
3246 continue
3247 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3248 if not isinstance(renderer, dict):
3249 continue
3250 owner = try_get(
3251 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3252 if owner:
3253 uploader['uploader'] = owner.get('text')
3254 uploader['uploader_id'] = try_get(
3255 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3256 uploader['uploader_url'] = urljoin(
3257 'https://www.youtube.com/',
3258 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3259 return uploader
3260
3261 def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
3262 selected_tab = self._extract_selected_tab(tabs)
3263 renderer = try_get(
3264 data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
3265 playlist_id = title = description = None
3266 if renderer:
3267 channel_title = renderer.get('title') or item_id
3268 tab_title = selected_tab.get('title')
3269 title = channel_title or item_id
3270 if tab_title:
3271 title += ' - %s' % tab_title
3272 description = renderer.get('description')
3273 playlist_id = renderer.get('externalId')
3274 renderer = try_get(
3275 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
3276 if renderer:
3277 title = renderer.get('title')
3278 description = None
3279 playlist_id = item_id
3280 if playlist_id is None:
3281 playlist_id = item_id
3282 if title is None:
3283 title = "Youtube " + playlist_id.title()
3284 playlist = self.playlist_result(
3285 self._entries(selected_tab, identity_token),
3286 playlist_id=playlist_id, playlist_title=title,
3287 playlist_description=description)
3288 playlist.update(self._extract_uploader(data))
3289 return playlist
3290
3291 def _extract_from_playlist(self, item_id, url, data, playlist):
3292 title = playlist.get('title') or try_get(
3293 data, lambda x: x['titleText']['simpleText'], compat_str)
3294 playlist_id = playlist.get('playlistId') or item_id
3295 # Inline playlist rendition continuation does not always work
3296 # at Youtube side, so delegating regular tab-based playlist URL
3297 # processing whenever possible.
3298 playlist_url = urljoin(url, try_get(
3299 playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
3300 compat_str))
3301 if playlist_url and playlist_url != url:
3302 return self.url_result(
3303 playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
3304 video_title=title)
3305 return self.playlist_result(
3306 self._playlist_entries(playlist), playlist_id=playlist_id,
3307 playlist_title=title)
3308
3309 @staticmethod
3310 def _extract_alerts(data):
3311 for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
3312 if not isinstance(alert_dict, dict):
3313 continue
3314 for renderer in alert_dict:
3315 alert = alert_dict[renderer]
3316 alert_type = alert.get('type')
3317 if not alert_type:
3318 continue
3319 message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
3320 if message:
3321 yield alert_type, message
3322 for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
3323 message = try_get(run, lambda x: x['text'], compat_str)
3324 if message:
3325 yield alert_type, message
3326
3327 def _extract_identity_token(self, webpage, item_id):
3328 ytcfg = self._extract_ytcfg(item_id, webpage)
3329 if ytcfg:
3330 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
3331 if token:
3332 return token
3333 return self._search_regex(
3334 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
3335 'identity token', default=None)
3336
    def _real_extract(self, url):
        # Dispatch a tab/feed/watch URL: canonicalise the host, then try in
        # order tab extraction, inline playlist extraction, and finally a
        # plain video fallback.
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # 'post' only matches when nothing but an optional '/' precedes the
        # query/fragment, i.e. the URL is a bare channel/user home page.
        # NOTE(review): relies on a 'not_channel' group in _VALID_URL, which
        # is defined outside this view.
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/home" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')

        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]

        # A watch URL without a video id is only usable if it names a playlist
        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
            if playlist_id:
                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
            else:
                raise ExtractorError('Unable to recognize tab page')
        # URL names both a video and a playlist: honour --no-playlist
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        # Surface any alert messages (e.g. removed/region-blocked content)
        for alert_type, alert_message in self._extract_alerts(data):
            self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3388
3389
class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Matches either a bare playlist id or any youtube/invidious URL that
    # carries a list= query parameter
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to YoutubeTabIE when it also matches (it handles full
        # playlist pages); this IE only catches what is left over.
        return False if YoutubeTabIE.suitable(url) else super(
            YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        # Normalise everything to a canonical playlist URL and delegate
        # the actual extraction to YoutubeTabIE
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not qs:
            # Bare playlist id (no query string): synthesise the list param
            qs = {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3464
3465
class YoutubeYtBeIE(InfoExtractor):
    # youtu.be short links that also carry a list= parameter; rebuilt into a
    # full watch URL and forwarded to YoutubeTabIE
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Keep both the video and playlist ids so the tab extractor can
        # decide between single-video and playlist download
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        playlist_id = mobj.group('playlist_id')
        return self.url_result(
            update_url_query('https://www.youtube.com/watch', {
                'v': video_id,
                'list': playlist_id,
                'feature': 'youtu.be',
            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3503
3504
class YoutubeYtUserIE(InfoExtractor):
    """Internal 'ytuser:NAME' scheme, redirected to the user's channel page."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/' + user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3517
3518
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Redirects ':ytfav' (and spelling variants) to the liked-videos list."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # 'LL' is the list id used for the liked-videos playlist
        liked_list_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_list_url, ie=YoutubeTabIE.ie_key())
3536
3537
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional 'params' blob added to the search request (e.g. sort filters)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent video results for *query*,
        paging through the youtubei/v1/search API via continuation tokens."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and continuation pages store the section list
            # in different places; try both.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break

            isr_contents = []
            continuation_token = None
            # Youtube sometimes adds promoted content to searches,
            # changing the index location of videos and token.
            # So we search through all entries till we find them.
            for index, isr in enumerate(slr_contents):
                if not isr_contents:
                    # Guard with 'or []': try_get returns None when the
                    # renderer is absent, which would otherwise crash the
                    # validation loop below with a TypeError.
                    isr_contents = try_get(
                        slr_contents,
                        (lambda x: x[index]['itemSectionRenderer']['contents']),
                        list) or []
                    # Only accept a section that actually contains videos
                    for content in isr_contents:
                        if content.get('videoRenderer') is not None:
                            break
                    else:
                        isr_contents = []

                if continuation_token is None:
                    continuation_token = try_get(
                        slr_contents,
                        lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][
                            'token'],
                        compat_str)
                # Stop scanning once both the videos and the token are found
                if continuation_token is not None and isr_contents:
                    break

            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Strip thousands separators/whitespace before parsing
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                # total increments by one per yield, so == is hit exactly
                if total == n:
                    return
            if not continuation_token:
                break
            data['continuation'] = continuation_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3645
3646
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded search-filter blob ('CAI=') selecting newest-first ordering
    _SEARCH_PARAMS = 'CAI%3D'
3652
3653
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
    # _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # SearchInfoExtractor normally derives its pattern from _SEARCH_KEY;
        # override so real /results URLs are matched instead
        return cls._VALID_URL

    def _real_extract(self, url):
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        # _VALID_URL guarantees at least one of search_query/q is present
        query = (qs.get('search_query') or qs.get('q'))[0]
        # Forward the 'sp' filter blob from the URL to the search request
        self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
        return self._get_n_results(query, self._MAX_RESULTS)
3679
3680
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """Common base for the authenticated feed extractors.

    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    @property
    def IE_NAME(self):
        return 'youtube:' + self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-specific, so authenticate up front
        self._login()

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/' + self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
3701
3702
class YoutubeWatchLaterIE(InfoExtractor):
    """Redirects ':ytwatchlater' to the account's Watch Later playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # 'WL' is the list id used for the Watch Later playlist
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3715
3716
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Also matches the bare youtube.com homepage, which shows recommendations
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }, {
        'url': 'https://youtube.com',
        'only_matching': True,
    }]
3731
3732
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    # Accepts :ytsub, :ytsubs, :ytsubscription, :ytsubscriptions
    _VALID_URL = r':ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
3744
3745
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
3754
3755
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Catches watch/attribution URLs whose v= parameter is missing, which
    # typically happens when an unquoted URL is cut at '&' by the shell
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
        attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted; fail with a helpful explanation instead
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3803
3804
class YoutubeTruncatedIDIE(InfoExtractor):
    """Rejects watch URLs whose video id is shorter than the 11 characters
    of a full YouTube id (i.e. the URL was cut off somewhere)."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
3820
3821
# Do Youtube show urls even exist anymore? I couldn't find any
# NOTE: kept as dead code — the raw-string literal below is never executed;
# revive the class if show URLs resurface
r'''
class YoutubeShowIE(YoutubeTabIE):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
'''