# Source: youtube_dlc/extractor/youtube.py (retrieved from the yt-dlp git mirror at jfr.im)
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27 )
28 from ..utils import (
29 bool_or_none,
30 clean_html,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_codecs,
39 parse_count,
40 parse_duration,
41 remove_quotes,
42 remove_start,
43 smuggle_url,
44 str_or_none,
45 str_to_int,
46 try_get,
47 unescapeHTML,
48 unified_strdate,
49 unsmuggle_url,
50 update_url_query,
51 uppercase_escape,
52 url_or_none,
53 urlencode_postdata,
54 urljoin,
55 )
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Internal ("/_/signin/sl/") endpoints of the reverse-engineered
    # GlifWebSignIn flow: account lookup, password challenge, and the
    # two-factor challenge URL (parameterized by the "TL" token).
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Machine name looked up in the user's .netrc file for credentials.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches YouTube playlist IDs (known prefixes + base64-ish tail) or
    # the special "RDMM" mix playlist.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'

    # Headers identifying requests as coming from the desktop web client.
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie so YouTube serves English-language pages."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        NOTE(review): this implements a reverse-engineered Google sign-in
        protocol (GlifWebSignIn).  The request/response payloads are opaque
        positional JSON arrays; the index meanings below are inferred from
        observed traffic and may break whenever Google changes the flow.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow; f_req is the opaque
            # positional payload serialized into the 'f.req' form field.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses are prefixed with an anti-XSSI guard (e.g. ")]}'");
                # strip everything before the first '[' so json can parse it.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            # Report non-fatal login problems without aborting extraction.
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Presumably an opaque per-account token required by the password
        # challenge step — extracted positionally, TODO confirm semantics.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry here signals a login error (e.g. bad password).
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # If present, the account requires an additional challenge
        # (two-step verification or an interactive security check).
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token that parameterizes the TFA submission URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users often paste codes with the SMS "G-" prefix; drop it.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-slot convention as the password challenge above.
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved non-interactively; map the
                # known codes to human-readable explanations.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Visiting the CheckCookie URL finalizes the session; a redirect to
        # myaccount.google.com in the response indicates success.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Wrap the base downloader, passing a private copy of the query dict."""
        # Copy so later mutation of the caller's dict cannot affect the request
        # (and vice versa).
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON blob, or None if absent."""
        config = self._search_regex(
            # Lookbehind (?<=}) ensures the lazy match ends at a closing brace,
            # i.e. at the end of the JSON object rather than mid-string.
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set language cookie and log in before any extraction is performed."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Default request body for InnerTube API calls: identifies us as the
    # desktop WEB client.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    def _call_api(self, ep, query, video_id):
        """POST to the InnerTube endpoint `ep` and return the parsed JSON.

        `query` is merged over _DEFAULT_API_DATA to form the JSON body.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            # Public API key used by the YouTube web client itself.
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract ytInitialData from a watch page; raises if not found
        (unlike the lenient _get_yt_initial_data above)."""
        return self._parse_json(
            self._search_regex(
                r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
                webpage, 'yt initial data'),
            video_id)
325
326 class YoutubeIE(YoutubeBaseInfoExtractor):
327 IE_DESC = 'YouTube.com'
328 _VALID_URL = r"""(?x)^
329 (
330 (?:https?://|//) # http(s):// or protocol-independent URL
331 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
332 (?:www\.)?deturl\.com/www\.youtube\.com/|
333 (?:www\.)?pwnyoutube\.com/|
334 (?:www\.)?hooktube\.com/|
335 (?:www\.)?yourepeat\.com/|
336 tube\.majestyc\.net/|
337 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
338 (?:(?:www|dev)\.)?invidio\.us/|
339 (?:(?:www|no)\.)?invidiou\.sh/|
340 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
341 (?:www\.)?invidious\.kabi\.tk/|
342 (?:www\.)?invidious\.13ad\.de/|
343 (?:www\.)?invidious\.mastodon\.host/|
344 (?:www\.)?invidious\.nixnet\.xyz/|
345 (?:www\.)?invidious\.drycat\.fr/|
346 (?:www\.)?tube\.poal\.co/|
347 (?:www\.)?vid\.wxzm\.sx/|
348 (?:www\.)?yewtu\.be/|
349 (?:www\.)?yt\.elukerio\.org/|
350 (?:www\.)?yt\.lelux\.fi/|
351 (?:www\.)?invidious\.ggc-project\.de/|
352 (?:www\.)?yt\.maisputain\.ovh/|
353 (?:www\.)?invidious\.13ad\.de/|
354 (?:www\.)?invidious\.toot\.koeln/|
355 (?:www\.)?invidious\.fdn\.fr/|
356 (?:www\.)?watch\.nettohikari\.com/|
357 (?:www\.)?kgg2m7yk5aybusll\.onion/|
358 (?:www\.)?qklhadlycap4cnod\.onion/|
359 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
360 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
361 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
362 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
363 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
364 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
365 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
366 (?:.*?\#/)? # handle anchor (#/) redirect urls
367 (?: # the various things that can precede the ID:
368 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
369 |(?: # or the v= param in all its forms
370 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
371 (?:\?|\#!?) # the params delimiter ? or # or #!
372 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
373 v=
374 )
375 ))
376 |(?:
377 youtu\.be| # just youtu.be/xxxx
378 vid\.plus| # or vid.plus/xxxx
379 zwearz\.com/watch| # or zwearz.com/watch/xxxx
380 )/
381 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
382 )
383 )? # all until now is optional -> you can pass the naked ID
384 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
385 (?!.*?\blist=
386 (?:
387 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
388 WL # WL are handled by the watch later IE
389 )
390 )
391 (?(1).+)? # if we found the ID, everything can follow
392 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
393 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
394 _PLAYER_INFO_RE = (
395 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
396 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
397 )
398 _formats = {
399 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
400 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
401 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
402 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
403 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
404 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
405 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
406 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
407 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
408 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
409 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
410 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
412 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
413 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
414 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
415 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
416 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
417
418
419 # 3D videos
420 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
421 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
422 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
423 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
424 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
425 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
426 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
427
428 # Apple HTTP Live Streaming
429 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
430 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
431 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
432 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
433 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
434 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
435 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
436 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
437
438 # DASH mp4 video
439 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
440 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
441 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
442 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
443 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
444 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
445 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
449 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
450 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
451
452 # Dash mp4 audio
453 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
454 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
455 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
456 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
457 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
458 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
459 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
460
461 # Dash webm
462 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
463 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
464 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
465 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
466 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
467 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
468 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
469 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
470 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
471 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
472 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
473 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
474 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
475 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
478 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
480 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
481 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
482 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
484
485 # Dash webm audio
486 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
487 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
488
489 # Dash webm audio with opus inside
490 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
491 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
492 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
493
494 # RTMP (unnamed)
495 '_rtmp': {'protocol': 'rtmp'},
496
497 # av01 video only formats sometimes served with "unknown" codecs
498 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
499 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
500 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
501 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
502 }
503 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
504
505 _GEO_BYPASS = False
506
507 IE_NAME = 'youtube'
508 _TESTS = [
509 {
510 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
511 'info_dict': {
512 'id': 'BaW_jenozKc',
513 'ext': 'mp4',
514 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
515 'uploader': 'Philipp Hagemeister',
516 'uploader_id': 'phihag',
517 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
518 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
519 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
520 'upload_date': '20121002',
521 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
522 'categories': ['Science & Technology'],
523 'tags': ['youtube-dl'],
524 'duration': 10,
525 'view_count': int,
526 'like_count': int,
527 'dislike_count': int,
528 'start_time': 1,
529 'end_time': 9,
530 }
531 },
532 {
533 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
534 'note': 'Embed-only video (#1746)',
535 'info_dict': {
536 'id': 'yZIXLfi8CZQ',
537 'ext': 'mp4',
538 'upload_date': '20120608',
539 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
540 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
541 'uploader': 'SET India',
542 'uploader_id': 'setindia',
543 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
544 'age_limit': 18,
545 }
546 },
547 {
548 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
549 'note': 'Use the first video ID in the URL',
550 'info_dict': {
551 'id': 'BaW_jenozKc',
552 'ext': 'mp4',
553 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
554 'uploader': 'Philipp Hagemeister',
555 'uploader_id': 'phihag',
556 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
557 'upload_date': '20121002',
558 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
559 'categories': ['Science & Technology'],
560 'tags': ['youtube-dl'],
561 'duration': 10,
562 'view_count': int,
563 'like_count': int,
564 'dislike_count': int,
565 },
566 'params': {
567 'skip_download': True,
568 },
569 },
570 {
571 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
572 'note': '256k DASH audio (format 141) via DASH manifest',
573 'info_dict': {
574 'id': 'a9LDPn-MO4I',
575 'ext': 'm4a',
576 'upload_date': '20121002',
577 'uploader_id': '8KVIDEO',
578 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
579 'description': '',
580 'uploader': '8KVIDEO',
581 'title': 'UHDTV TEST 8K VIDEO.mp4'
582 },
583 'params': {
584 'youtube_include_dash_manifest': True,
585 'format': '141',
586 },
587 'skip': 'format 141 not served anymore',
588 },
589 # DASH manifest with encrypted signature
590 {
591 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
592 'info_dict': {
593 'id': 'IB3lcPjvWLA',
594 'ext': 'm4a',
595 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
596 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
597 'duration': 244,
598 'uploader': 'AfrojackVEVO',
599 'uploader_id': 'AfrojackVEVO',
600 'upload_date': '20131011',
601 },
602 'params': {
603 'youtube_include_dash_manifest': True,
604 'format': '141/bestaudio[ext=m4a]',
605 },
606 },
607 # Controversy video
608 {
609 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
610 'info_dict': {
611 'id': 'T4XJQO3qol8',
612 'ext': 'mp4',
613 'duration': 219,
614 'upload_date': '20100909',
615 'uploader': 'Amazing Atheist',
616 'uploader_id': 'TheAmazingAtheist',
617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
618 'title': 'Burning Everyone\'s Koran',
619 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
620 }
621 },
622 # Normal age-gate video (embed allowed)
623 {
624 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
625 'info_dict': {
626 'id': 'HtVdAasjOgU',
627 'ext': 'mp4',
628 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
629 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
630 'duration': 142,
631 'uploader': 'The Witcher',
632 'uploader_id': 'WitcherGame',
633 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
634 'upload_date': '20140605',
635 'age_limit': 18,
636 },
637 },
638 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
639 # YouTube Red ad is not captured for creator
640 {
641 'url': '__2ABJjxzNo',
642 'info_dict': {
643 'id': '__2ABJjxzNo',
644 'ext': 'mp4',
645 'duration': 266,
646 'upload_date': '20100430',
647 'uploader_id': 'deadmau5',
648 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
649 'creator': 'Dada Life, deadmau5',
650 'description': 'md5:12c56784b8032162bb936a5f76d55360',
651 'uploader': 'deadmau5',
652 'title': 'Deadmau5 - Some Chords (HD)',
653 'alt_title': 'This Machine Kills Some Chords',
654 },
655 'expected_warnings': [
656 'DASH manifest missing',
657 ]
658 },
659 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
660 {
661 'url': 'lqQg6PlCWgI',
662 'info_dict': {
663 'id': 'lqQg6PlCWgI',
664 'ext': 'mp4',
665 'duration': 6085,
666 'upload_date': '20150827',
667 'uploader_id': 'olympic',
668 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
669 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
670 'uploader': 'Olympic',
671 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
672 },
673 'params': {
674 'skip_download': 'requires avconv',
675 }
676 },
677 # Non-square pixels
678 {
679 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
680 'info_dict': {
681 'id': '_b-2C3KPAM0',
682 'ext': 'mp4',
683 'stretched_ratio': 16 / 9.,
684 'duration': 85,
685 'upload_date': '20110310',
686 'uploader_id': 'AllenMeow',
687 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
688 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
689 'uploader': '孫ᄋᄅ',
690 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
691 },
692 },
693 # url_encoded_fmt_stream_map is empty string
694 {
695 'url': 'qEJwOuvDf7I',
696 'info_dict': {
697 'id': 'qEJwOuvDf7I',
698 'ext': 'webm',
699 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
700 'description': '',
701 'upload_date': '20150404',
702 'uploader_id': 'spbelect',
703 'uploader': 'Наблюдатели Петербурга',
704 },
705 'params': {
706 'skip_download': 'requires avconv',
707 },
708 'skip': 'This live event has ended.',
709 },
710 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
711 {
712 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
713 'info_dict': {
714 'id': 'FIl7x6_3R5Y',
715 'ext': 'webm',
716 'title': 'md5:7b81415841e02ecd4313668cde88737a',
717 'description': 'md5:116377fd2963b81ec4ce64b542173306',
718 'duration': 220,
719 'upload_date': '20150625',
720 'uploader_id': 'dorappi2000',
721 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
722 'uploader': 'dorappi2000',
723 'formats': 'mincount:31',
724 },
725 'skip': 'not actual anymore',
726 },
727 # DASH manifest with segment_list
728 {
729 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
730 'md5': '8ce563a1d667b599d21064e982ab9e31',
731 'info_dict': {
732 'id': 'CsmdDsKjzN8',
733 'ext': 'mp4',
734 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
735 'uploader': 'Airtek',
736 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
737 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
738 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
739 },
740 'params': {
741 'youtube_include_dash_manifest': True,
742 'format': '135', # bestvideo
743 },
744 'skip': 'This live event has ended.',
745 },
746 {
747 # Multifeed videos (multiple cameras), URL is for Main Camera
748 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
749 'info_dict': {
750 'id': 'jqWvoWXjCVs',
751 'title': 'teamPGP: Rocket League Noob Stream',
752 'description': 'md5:dc7872fb300e143831327f1bae3af010',
753 },
754 'playlist': [{
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'ext': 'mp4',
758 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
759 'description': 'md5:dc7872fb300e143831327f1bae3af010',
760 'duration': 7335,
761 'upload_date': '20150721',
762 'uploader': 'Beer Games Beer',
763 'uploader_id': 'beergamesbeer',
764 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
765 'license': 'Standard YouTube License',
766 },
767 }, {
768 'info_dict': {
769 'id': '6h8e8xoXJzg',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
773 'duration': 7337,
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
778 'license': 'Standard YouTube License',
779 },
780 }, {
781 'info_dict': {
782 'id': 'PUOgX5z9xZw',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
786 'duration': 7337,
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
791 'license': 'Standard YouTube License',
792 },
793 }, {
794 'info_dict': {
795 'id': 'teuwxikvS5k',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (zim)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7334,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }],
807 'params': {
808 'skip_download': True,
809 },
810 'skip': 'This video is not available.',
811 },
812 {
813 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
814 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
815 'info_dict': {
816 'id': 'gVfLd0zydlo',
817 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
818 },
819 'playlist_count': 2,
820 'skip': 'Not multifeed anymore',
821 },
822 {
823 'url': 'https://vid.plus/FlRa-iH7PGw',
824 'only_matching': True,
825 },
826 {
827 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
828 'only_matching': True,
829 },
830 {
831 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
832 # Also tests cut-off URL expansion in video description (see
833 # https://github.com/ytdl-org/youtube-dl/issues/1892,
834 # https://github.com/ytdl-org/youtube-dl/issues/8164)
835 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
836 'info_dict': {
837 'id': 'lsguqyKfVQg',
838 'ext': 'mp4',
839 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
840 'alt_title': 'Dark Walk - Position Music',
841 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
842 'duration': 133,
843 'upload_date': '20151119',
844 'uploader_id': 'IronSoulElf',
845 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
846 'uploader': 'IronSoulElf',
847 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
848 'track': 'Dark Walk - Position Music',
849 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
850 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
851 },
852 'params': {
853 'skip_download': True,
854 },
855 },
856 {
857 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
858 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
859 'only_matching': True,
860 },
861 {
862 # Video with yt:stretch=17:0
863 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
864 'info_dict': {
865 'id': 'Q39EVAstoRM',
866 'ext': 'mp4',
867 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
868 'description': 'md5:ee18a25c350637c8faff806845bddee9',
869 'upload_date': '20151107',
870 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
871 'uploader': 'CH GAMER DROID',
872 },
873 'params': {
874 'skip_download': True,
875 },
876 'skip': 'This video does not exist.',
877 },
878 {
879 # Video licensed under Creative Commons
880 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
881 'info_dict': {
882 'id': 'M4gD1WSo5mA',
883 'ext': 'mp4',
884 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
885 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
886 'duration': 721,
887 'upload_date': '20150127',
888 'uploader_id': 'BerkmanCenter',
889 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
890 'uploader': 'The Berkman Klein Center for Internet & Society',
891 'license': 'Creative Commons Attribution license (reuse allowed)',
892 },
893 'params': {
894 'skip_download': True,
895 },
896 },
897 {
898 # Channel-like uploader_url
899 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
900 'info_dict': {
901 'id': 'eQcmzGIKrzg',
902 'ext': 'mp4',
903 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
904 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
905 'duration': 4060,
906 'upload_date': '20151119',
907 'uploader': 'Bernie Sanders',
908 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
910 'license': 'Creative Commons Attribution license (reuse allowed)',
911 },
912 'params': {
913 'skip_download': True,
914 },
915 },
916 {
917 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
918 'only_matching': True,
919 },
920 {
921 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
922 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
923 'only_matching': True,
924 },
925 {
926 # Rental video preview
927 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
928 'info_dict': {
929 'id': 'uGpuVWrhIzE',
930 'ext': 'mp4',
931 'title': 'Piku - Trailer',
932 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
933 'upload_date': '20150811',
934 'uploader': 'FlixMatrix',
935 'uploader_id': 'FlixMatrixKaravan',
936 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
937 'license': 'Standard YouTube License',
938 },
939 'params': {
940 'skip_download': True,
941 },
942 'skip': 'This video is not available.',
943 },
944 {
945 # YouTube Red video with episode data
946 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
947 'info_dict': {
948 'id': 'iqKdEhx-dD4',
949 'ext': 'mp4',
950 'title': 'Isolation - Mind Field (Ep 1)',
951 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
952 'duration': 2085,
953 'upload_date': '20170118',
954 'uploader': 'Vsauce',
955 'uploader_id': 'Vsauce',
956 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
957 'series': 'Mind Field',
958 'season_number': 1,
959 'episode_number': 1,
960 },
961 'params': {
962 'skip_download': True,
963 },
964 'expected_warnings': [
965 'Skipping DASH manifest',
966 ],
967 },
968 {
969 # The following content has been identified by the YouTube community
970 # as inappropriate or offensive to some audiences.
971 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
972 'info_dict': {
973 'id': '6SJNVb0GnPI',
974 'ext': 'mp4',
975 'title': 'Race Differences in Intelligence',
976 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
977 'duration': 965,
978 'upload_date': '20140124',
979 'uploader': 'New Century Foundation',
980 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
981 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
982 },
983 'params': {
984 'skip_download': True,
985 },
986 },
987 {
988 # itag 212
989 'url': '1t24XAntNCY',
990 'only_matching': True,
991 },
992 {
993 # geo restricted to JP
994 'url': 'sJL6WA-aGkQ',
995 'only_matching': True,
996 },
997 {
998 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
999 'only_matching': True,
1000 },
1001 {
1002 # DRM protected
1003 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1004 'only_matching': True,
1005 },
1006 {
1007 # Video with unsupported adaptive stream type formats
1008 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1009 'info_dict': {
1010 'id': 'Z4Vy8R84T1U',
1011 'ext': 'mp4',
1012 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1013 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1014 'duration': 433,
1015 'upload_date': '20130923',
1016 'uploader': 'Amelia Putri Harwita',
1017 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1018 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1019 'formats': 'maxcount:10',
1020 },
1021 'params': {
1022 'skip_download': True,
1023 'youtube_include_dash_manifest': False,
1024 },
1025 'skip': 'not actual anymore',
1026 },
1027 {
1028 # Youtube Music Auto-generated description
1029 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1030 'info_dict': {
1031 'id': 'MgNrAu2pzNs',
1032 'ext': 'mp4',
1033 'title': 'Voyeur Girl',
1034 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1035 'upload_date': '20190312',
1036 'uploader': 'Stephen - Topic',
1037 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1038 'artist': 'Stephen',
1039 'track': 'Voyeur Girl',
1040 'album': 'it\'s too much love to know my dear',
1041 'release_date': '20190313',
1042 'release_year': 2019,
1043 },
1044 'params': {
1045 'skip_download': True,
1046 },
1047 },
1048 {
1049 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1050 'only_matching': True,
1051 },
1052 {
1053 # invalid -> valid video id redirection
1054 'url': 'DJztXj2GPfl',
1055 'info_dict': {
1056 'id': 'DJztXj2GPfk',
1057 'ext': 'mp4',
1058 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1059 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1060 'upload_date': '20090125',
1061 'uploader': 'Prochorowka',
1062 'uploader_id': 'Prochorowka',
1063 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1064 'artist': 'Panjabi MC',
1065 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1066 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1067 },
1068 'params': {
1069 'skip_download': True,
1070 },
1071 },
1072 {
1073 # empty description results in an empty string
1074 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1075 'info_dict': {
1076 'id': 'x41yOUIvK2k',
1077 'ext': 'mp4',
1078 'title': 'IMG 3456',
1079 'description': '',
1080 'upload_date': '20170613',
1081 'uploader_id': 'ElevageOrVert',
1082 'uploader': 'ElevageOrVert',
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 ]
1089
1090 def __init__(self, *args, **kwargs):
1091 super(YoutubeIE, self).__init__(*args, **kwargs)
1092 self._player_cache = {}
1093
1094 def report_video_info_webpage_download(self, video_id):
1095 """Report attempt to download video info webpage."""
1096 self.to_screen('%s: Downloading video info webpage' % video_id)
1097
1098 def report_information_extraction(self, video_id):
1099 """Report attempt to extract video information."""
1100 self.to_screen('%s: Extracting video information' % video_id)
1101
1102 def report_unavailable_format(self, video_id, format):
1103 """Report extracted video URL."""
1104 self.to_screen('%s: Format %s not available' % (video_id, format))
1105
1106 def report_rtmp_download(self):
1107 """Indicate the download will use the RTMP protocol."""
1108 self.to_screen('RTMP download detected')
1109
1110 def _signature_cache_id(self, example_sig):
1111 """ Return a string representation of a signature """
1112 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1113
1114 @classmethod
1115 def _extract_player_info(cls, player_url):
1116 for player_re in cls._PLAYER_INFO_RE:
1117 id_m = re.search(player_re, player_url)
1118 if id_m:
1119 break
1120 else:
1121 raise ExtractorError('Cannot identify player %r' % player_url)
1122 return id_m.group('ext'), id_m.group('id')
1123
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the on-disk cache) a callable that deciphers
        scrambled signatures produced by the player at player_url.

        example_sig is only used for its length layout (see
        _signature_cache_id), so a cached spec can be reused for any
        signature with the same part lengths.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id is used as a cache file name; make sure no path separators
        # slipped in via player id or signature layout
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of input indices: deciphering is a
            # fixed selection/permutation of the input characters.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of distinct characters
        # to record which input index ends up at each output position...
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        # ...and persist that index list so future runs skip the download.
        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1163
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*
        (debugging aid, used with the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            # Yield compact expressions ('s[3]', 's[8:2:-1]', ...) that
            # reproduce the index sequence idxs, merging runs whose step is
            # +1 or -1 into slices.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open slice.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct characters to learn the output index layout.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1202
1203 def _parse_sig_js(self, jscode):
1204 funcname = self._search_regex(
1205 (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1206 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1207 r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1208 r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1209 # Obsolete patterns
1210 r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1211 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1212 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1213 r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1214 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1215 r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1216 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1217 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1218 jscode, 'Initial JS player signature function name', group='sig')
1219
1220 jsi = JSInterpreter(jscode)
1221 initial_function = jsi.extract_function(funcname)
1222 return lambda s: initial_function([s])
1223
1224 def _parse_sig_swf(self, file_contents):
1225 swfi = SWFInterpreter(file_contents)
1226 TARGET_CLASSNAME = 'SignatureDecipher'
1227 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1228 initial_function = swfi.extract_function(searched_class, 'decipher')
1229 return lambda s: initial_function([s])
1230
1231 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1232 """Turn the encrypted s field into a working signature"""
1233
1234 if player_url is None:
1235 raise ExtractorError('Cannot decrypt signature without player_url')
1236
1237 if player_url.startswith('//'):
1238 player_url = 'https:' + player_url
1239 elif not re.match(r'https?://', player_url):
1240 player_url = compat_urlparse.urljoin(
1241 'https://www.youtube.com', player_url)
1242 try:
1243 player_id = (player_url, self._signature_cache_id(s))
1244 if player_id not in self._player_cache:
1245 func = self._extract_signature_function(
1246 video_id, player_url, s
1247 )
1248 self._player_cache[player_id] = func
1249 func = self._player_cache[player_id]
1250 if self._downloader.params.get('youtube_print_sig_code'):
1251 self._print_sig_code(func, s)
1252 return func(s)
1253 except Exception as e:
1254 tb = traceback.format_exc()
1255 raise ExtractorError(
1256 'Signature extraction failed: ' + tb, cause=e)
1257
1258 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1259 try:
1260 subs_doc = self._download_xml(
1261 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1262 video_id, note=False)
1263 except ExtractorError as err:
1264 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1265 return {}
1266
1267 sub_lang_list = {}
1268 for track in subs_doc.findall('track'):
1269 lang = track.attrib['lang_code']
1270 if lang in sub_lang_list:
1271 continue
1272 sub_formats = []
1273 for ext in self._SUBTITLE_FORMATS:
1274 params = compat_urllib_parse_urlencode({
1275 'lang': lang,
1276 'v': video_id,
1277 'fmt': ext,
1278 'name': track.attrib['name'].encode('utf-8'),
1279 })
1280 sub_formats.append({
1281 'url': 'https://www.youtube.com/api/timedtext?' + params,
1282 'ext': ext,
1283 })
1284 sub_lang_list[lang] = sub_formats
1285 if has_live_chat_replay:
1286 sub_lang_list['live_chat'] = [
1287 {
1288 'video_id': video_id,
1289 'ext': 'json',
1290 'protocol': 'youtube_live_chat_replay',
1291 },
1292 ]
1293 if not sub_lang_list:
1294 self._downloader.report_warning('video doesn\'t have subtitles')
1295 return {}
1296 return sub_lang_list
1297
1298 def _get_ytplayer_config(self, video_id, webpage):
1299 patterns = (
1300 # User data may contain arbitrary character sequences that may affect
1301 # JSON extraction with regex, e.g. when '};' is contained the second
1302 # regex won't capture the whole JSON. Yet working around by trying more
1303 # concrete regex first keeping in mind proper quoted string handling
1304 # to be implemented in future that will replace this workaround (see
1305 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1306 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1307 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1308 r';ytplayer\.config\s*=\s*({.+?});',
1309 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
1310 )
1311 config = self._search_regex(
1312 patterns, webpage, 'ytplayer.config', default=None)
1313 if config:
1314 return self._parse_json(
1315 uppercase_escape(config), video_id, fatal=False)
1316
1317 def _get_music_metadata_from_yt_initial(self, yt_initial):
1318 music_metadata = []
1319 key_map = {
1320 'Album': 'album',
1321 'Artist': 'artist',
1322 'Song': 'track'
1323 }
1324 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1325 if type(contents) is list:
1326 for content in contents:
1327 music_track = {}
1328 if type(content) is not dict:
1329 continue
1330 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1331 if type(videoSecondaryInfoRenderer) is not dict:
1332 continue
1333 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1334 if type(rows) is not list:
1335 continue
1336 for row in rows:
1337 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1338 if type(metadataRowRenderer) is not dict:
1339 continue
1340 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1341 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1342 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1343 if type(key) is not str or type(value) is not str:
1344 continue
1345 if key in key_map:
1346 if key_map[key] in music_track:
1347 # we've started on a new track
1348 music_metadata.append(music_track)
1349 music_track = {}
1350 music_track[key_map[key]] = value
1351 if len(music_track.keys()):
1352 music_metadata.append(music_track)
1353 return music_metadata
1354
    def _get_automatic_captions(self, video_id, webpage):
        """Extract automatic (ASR) caption tracks for a video.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a mapping of language code -> list of format dicts; an empty
        dict (with a warning) when no automatic captions can be found.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            # Legacy path: a direct timedtext service URL in the player args.
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One entry per translation target, each with every supported
                # subtitle format.
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the base
                # caption URL's query string for each (language, format) pair.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1456
1457 def _mark_watched(self, video_id, video_info, player_response):
1458 playback_url = url_or_none(try_get(
1459 player_response,
1460 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1461 video_info, lambda x: x['videostats_playback_base_url'][0]))
1462 if not playback_url:
1463 return
1464 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1465 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1466
1467 # cpn generation algorithm is reverse engineered from base.js.
1468 # In fact it works even with dummy cpn.
1469 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1470 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1471
1472 qs.update({
1473 'ver': ['2'],
1474 'cpn': [cpn],
1475 })
1476 playback_url = compat_urlparse.urlunparse(
1477 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1478
1479 self._download_webpage(
1480 playback_url, video_id, 'Marking watched',
1481 'Unable to mark watched', fatal=False)
1482
1483 @staticmethod
1484 def _extract_urls(webpage):
1485 # Embedded YouTube player
1486 entries = [
1487 unescapeHTML(mobj.group('url'))
1488 for mobj in re.finditer(r'''(?x)
1489 (?:
1490 <iframe[^>]+?src=|
1491 data-video-url=|
1492 <embed[^>]+?src=|
1493 embedSWF\(?:\s*|
1494 <object[^>]+data=|
1495 new\s+SWFObject\(
1496 )
1497 (["\'])
1498 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1499 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1500 \1''', webpage)]
1501
1502 # lazyYT YouTube embed
1503 entries.extend(list(map(
1504 unescapeHTML,
1505 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1506
1507 # Wordpress "YouTube Video Importer" plugin
1508 matches = re.findall(r'''(?x)<div[^>]+
1509 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1510 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1511 entries.extend(m[-1] for m in matches)
1512
1513 return entries
1514
1515 @staticmethod
1516 def _extract_url(webpage):
1517 urls = YoutubeIE._extract_urls(webpage)
1518 return urls[0] if urls else None
1519
1520 @classmethod
1521 def extract_id(cls, url):
1522 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1523 if mobj is None:
1524 raise ExtractorError('Invalid URL: %s' % url)
1525 video_id = mobj.group(2)
1526 return video_id
1527
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapter markers from the ytInitialData embedded in the
        watch page.

        Returns a list of {start_time, end_time, title} dicts, or None when
        the page carries no chapter data.
        """
        if not webpage:
            return
        data = self._extract_yt_initial_data(video_id, webpage)
        if not data or not isinstance(data, dict):
            return
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Start times come in milliseconds; convert to seconds.
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one begins; the last chapter ends
            # at the full video duration.
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1572
    @staticmethod
    def _extract_chapters_from_description(description, duration):
        """Parse chapter markers from seekTo anchor links in the (HTML)
        video description.

        Returns a list of {start_time, end_time, title} dicts, or None when
        the description contains no chapter lines.
        """
        if not description:
            return None
        chapter_lines = re.findall(
            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
            description)
        if not chapter_lines:
            return None
        chapters = []
        for next_num, (chapter_line, time_point) in enumerate(
                chapter_lines, start=1):
            start_time = parse_duration(time_point)
            if start_time is None:
                continue
            # Chapters past the end of the video cannot be valid; as the
            # lines appear in order, stop entirely.
            if start_time > duration:
                break
            # A chapter ends where the next one begins; the last chapter ends
            # at the full video duration.
            end_time = (duration if next_num == len(chapter_lines)
                        else parse_duration(chapter_lines[next_num][1]))
            if end_time is None:
                continue
            if end_time > duration:
                end_time = duration
            if start_time > end_time:
                break
            # Strip the seekTo anchor markup and surrounding separators to
            # get the plain chapter title, collapsing runs of whitespace.
            chapter_title = re.sub(
                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
            chapter_title = re.sub(r'\s+', ' ', chapter_title)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': chapter_title,
            })
        return chapters
1607
1608 def _extract_chapters(self, webpage, description, video_id, duration):
1609 return (self._extract_chapters_from_json(webpage, video_id, duration)
1610 or self._extract_chapters_from_description(description, duration))
1611
1612 def _real_extract(self, url):
1613 url, smuggled_data = unsmuggle_url(url, {})
1614
1615 proto = (
1616 'http' if self._downloader.params.get('prefer_insecure', False)
1617 else 'https')
1618
1619 start_time = None
1620 end_time = None
1621 parsed_url = compat_urllib_parse_urlparse(url)
1622 for component in [parsed_url.fragment, parsed_url.query]:
1623 query = compat_parse_qs(component)
1624 if start_time is None and 't' in query:
1625 start_time = parse_duration(query['t'][0])
1626 if start_time is None and 'start' in query:
1627 start_time = parse_duration(query['start'][0])
1628 if end_time is None and 'end' in query:
1629 end_time = parse_duration(query['end'][0])
1630
1631 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1632 mobj = re.search(self._NEXT_URL_RE, url)
1633 if mobj:
1634 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1635 video_id = self.extract_id(url)
1636
1637 # Get video webpage
1638 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1639 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1640
1641 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1642 video_id = qs.get('v', [None])[0] or video_id
1643
1644 # Attempt to extract SWF player URL
1645 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1646 if mobj is not None:
1647 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1648 else:
1649 player_url = None
1650
1651 dash_mpds = []
1652
1653 def add_dash_mpd(video_info):
1654 dash_mpd = video_info.get('dashmpd')
1655 if dash_mpd and dash_mpd[0] not in dash_mpds:
1656 dash_mpds.append(dash_mpd[0])
1657
1658 def add_dash_mpd_pr(pl_response):
1659 dash_mpd = url_or_none(try_get(
1660 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1661 compat_str))
1662 if dash_mpd and dash_mpd not in dash_mpds:
1663 dash_mpds.append(dash_mpd)
1664
1665 is_live = None
1666 view_count = None
1667
1668 def extract_view_count(v_info):
1669 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1670
1671 def extract_player_response(player_response, video_id):
1672 pl_response = str_or_none(player_response)
1673 if not pl_response:
1674 return
1675 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1676 if isinstance(pl_response, dict):
1677 add_dash_mpd_pr(pl_response)
1678 return pl_response
1679
1680 def extract_embedded_config(embed_webpage, video_id):
1681 embedded_config = self._search_regex(
1682 r'setConfig\(({.*})\);',
1683 embed_webpage, 'ytInitialData', default=None)
1684 if embedded_config:
1685 return embedded_config
1686
1687 player_response = {}
1688
1689 # Get video info
1690 video_info = {}
1691 embed_webpage = None
1692 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1693 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1694 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1695 age_gate = True
1696 # We simulate the access to the video from www.youtube.com/v/{video_id}
1697 # this can be viewed without login into Youtube
1698 url = proto + '://www.youtube.com/embed/%s' % video_id
1699 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1700 ext = extract_embedded_config(embed_webpage, video_id)
1701 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1702 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1703 if not playable_in_embed:
1704 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1705 playable_in_embed = ''
1706 else:
1707 playable_in_embed = playable_in_embed.group('playableinEmbed')
1708 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1709 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1710 if playable_in_embed == 'false':
1711 '''
1712 # TODO apply this patch when Support for Python 2.6(!) and above drops
1713 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1714 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1715 '''
1716 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1717 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1718 age_gate = False
1719 # Try looking directly into the video webpage
1720 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1721 if ytplayer_config:
1722 args = ytplayer_config.get("args")
1723 if args is not None:
1724 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1725 # Convert to the same format returned by compat_parse_qs
1726 video_info = dict((k, [v]) for k, v in args.items())
1727 add_dash_mpd(video_info)
1728 # Rental video is not rented but preview is available (e.g.
1729 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1730 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1731 if not video_info and args.get('ypc_vid'):
1732 return self.url_result(
1733 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1734 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1735 is_live = True
1736 if not player_response:
1737 player_response = extract_player_response(args.get('player_response'), video_id)
1738 elif not player_response:
1739 player_response = ytplayer_config
1740 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1741 add_dash_mpd_pr(player_response)
1742 else:
1743 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1744 else:
1745 data = compat_urllib_parse_urlencode({
1746 'video_id': video_id,
1747 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1748 'sts': self._search_regex(
1749 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1750 })
1751 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1752 try:
1753 video_info_webpage = self._download_webpage(
1754 video_info_url, video_id,
1755 note='Refetching age-gated info webpage',
1756 errnote='unable to download video info webpage')
1757 except ExtractorError:
1758 video_info_webpage = None
1759 if video_info_webpage:
1760 video_info = compat_parse_qs(video_info_webpage)
1761 pl_response = video_info.get('player_response', [None])[0]
1762 player_response = extract_player_response(pl_response, video_id)
1763 add_dash_mpd(video_info)
1764 view_count = extract_view_count(video_info)
1765 else:
1766 age_gate = False
1767 # Try looking directly into the video webpage
1768 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1769 if ytplayer_config:
1770 args = ytplayer_config.get('args', {})
1771 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1772 # Convert to the same format returned by compat_parse_qs
1773 video_info = dict((k, [v]) for k, v in args.items())
1774 add_dash_mpd(video_info)
1775 # Rental video is not rented but preview is available (e.g.
1776 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1777 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1778 if not video_info and args.get('ypc_vid'):
1779 return self.url_result(
1780 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1781 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1782 is_live = True
1783 if not player_response:
1784 player_response = extract_player_response(args.get('player_response'), video_id)
1785 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1786 add_dash_mpd_pr(player_response)
1787
1788 if not video_info and not player_response:
1789 player_response = extract_player_response(
1790 self._search_regex(
1791 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1792 'initial player response', default='{}'),
1793 video_id)
1794
1795 def extract_unavailable_message():
1796 messages = []
1797 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1798 msg = self._html_search_regex(
1799 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1800 video_webpage, 'unavailable %s' % kind, default=None)
1801 if msg:
1802 messages.append(msg)
1803 if messages:
1804 return '\n'.join(messages)
1805
1806 if not video_info and not player_response:
1807 unavailable_message = extract_unavailable_message()
1808 if not unavailable_message:
1809 unavailable_message = 'Unable to extract video data'
1810 raise ExtractorError(
1811 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1812
1813 if not isinstance(video_info, dict):
1814 video_info = {}
1815
1816 video_details = try_get(
1817 player_response, lambda x: x['videoDetails'], dict) or {}
1818
1819 microformat = try_get(
1820 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1821
1822 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1823 if not video_title:
1824 self._downloader.report_warning('Unable to extract video title')
1825 video_title = '_'
1826
1827 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1828 if video_description:
1829
1830 def replace_url(m):
1831 redir_url = compat_urlparse.urljoin(url, m.group(1))
1832 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1833 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1834 qs = compat_parse_qs(parsed_redir_url.query)
1835 q = qs.get('q')
1836 if q and q[0]:
1837 return q[0]
1838 return redir_url
1839
1840 description_original = video_description = re.sub(r'''(?x)
1841 <a\s+
1842 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1843 (?:title|href)="([^"]+)"\s+
1844 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1845 class="[^"]*"[^>]*>
1846 [^<]+\.{3}\s*
1847 </a>
1848 ''', replace_url, video_description)
1849 video_description = clean_html(video_description)
1850 else:
1851 video_description = video_details.get('shortDescription')
1852 if video_description is None:
1853 video_description = self._html_search_meta('description', video_webpage)
1854
1855 if not smuggled_data.get('force_singlefeed', False):
1856 if not self._downloader.params.get('noplaylist'):
1857 multifeed_metadata_list = try_get(
1858 player_response,
1859 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1860 compat_str) or try_get(
1861 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1862 if multifeed_metadata_list:
1863 entries = []
1864 feed_ids = []
1865 for feed in multifeed_metadata_list.split(','):
1866 # Unquote should take place before split on comma (,) since textual
1867 # fields may contain comma as well (see
1868 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1869 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1870
1871 def feed_entry(name):
1872 return try_get(feed_data, lambda x: x[name][0], compat_str)
1873
1874 feed_id = feed_entry('id')
1875 if not feed_id:
1876 continue
1877 feed_title = feed_entry('title')
1878 title = video_title
1879 if feed_title:
1880 title += ' (%s)' % feed_title
1881 entries.append({
1882 '_type': 'url_transparent',
1883 'ie_key': 'Youtube',
1884 'url': smuggle_url(
1885 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1886 {'force_singlefeed': True}),
1887 'title': title,
1888 })
1889 feed_ids.append(feed_id)
1890 self.to_screen(
1891 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1892 % (', '.join(feed_ids), video_id))
1893 return self.playlist_result(entries, video_id, video_title, video_description)
1894 else:
1895 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1896
1897 if view_count is None:
1898 view_count = extract_view_count(video_info)
1899 if view_count is None and video_details:
1900 view_count = int_or_none(video_details.get('viewCount'))
1901 if view_count is None and microformat:
1902 view_count = int_or_none(microformat.get('viewCount'))
1903
1904 if is_live is None:
1905 is_live = bool_or_none(video_details.get('isLive'))
1906
1907 has_live_chat_replay = False
1908 if not is_live:
1909 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1910 try:
1911 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1912 has_live_chat_replay = True
1913 except (KeyError, IndexError, TypeError):
1914 pass
1915
1916 # Check for "rental" videos
1917 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1918 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1919
1920 def _extract_filesize(media_url):
1921 return int_or_none(self._search_regex(
1922 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1923
1924 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1925 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1926
1927 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1928 self.report_rtmp_download()
1929 formats = [{
1930 'format_id': '_rtmp',
1931 'protocol': 'rtmp',
1932 'url': video_info['conn'][0],
1933 'player_url': player_url,
1934 }]
1935 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1936 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1937 if 'rtmpe%3Dyes' in encoded_url_map:
1938 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1939 formats = []
1940 formats_spec = {}
1941 fmt_list = video_info.get('fmt_list', [''])[0]
1942 if fmt_list:
1943 for fmt in fmt_list.split(','):
1944 spec = fmt.split('/')
1945 if len(spec) > 1:
1946 width_height = spec[1].split('x')
1947 if len(width_height) == 2:
1948 formats_spec[spec[0]] = {
1949 'resolution': spec[1],
1950 'width': int_or_none(width_height[0]),
1951 'height': int_or_none(width_height[1]),
1952 }
1953 for fmt in streaming_formats:
1954 itag = str_or_none(fmt.get('itag'))
1955 if not itag:
1956 continue
1957 quality = fmt.get('quality')
1958 quality_label = fmt.get('qualityLabel') or quality
1959 formats_spec[itag] = {
1960 'asr': int_or_none(fmt.get('audioSampleRate')),
1961 'filesize': int_or_none(fmt.get('contentLength')),
1962 'format_note': quality_label,
1963 'fps': int_or_none(fmt.get('fps')),
1964 'height': int_or_none(fmt.get('height')),
1965 # bitrate for itag 43 is always 2147483647
1966 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1967 'width': int_or_none(fmt.get('width')),
1968 }
1969
1970 for fmt in streaming_formats:
1971 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1972 continue
1973 url = url_or_none(fmt.get('url'))
1974
1975 if not url:
1976 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1977 if not cipher:
1978 continue
1979 url_data = compat_parse_qs(cipher)
1980 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1981 if not url:
1982 continue
1983 else:
1984 cipher = None
1985 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1986
1987 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1988 # Unsupported FORMAT_STREAM_TYPE_OTF
1989 if stream_type == 3:
1990 continue
1991
1992 format_id = fmt.get('itag') or url_data['itag'][0]
1993 if not format_id:
1994 continue
1995 format_id = compat_str(format_id)
1996
1997 if cipher:
1998 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1999 ASSETS_RE = (
2000 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2001 r'"jsUrl"\s*:\s*("[^"]+")',
2002 r'"assets":.+?"js":\s*("[^"]+")')
2003 jsplayer_url_json = self._search_regex(
2004 ASSETS_RE,
2005 embed_webpage if age_gate else video_webpage,
2006 'JS player URL (1)', default=None)
2007 if not jsplayer_url_json and not age_gate:
2008 # We need the embed website after all
2009 if embed_webpage is None:
2010 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2011 embed_webpage = self._download_webpage(
2012 embed_url, video_id, 'Downloading embed webpage')
2013 jsplayer_url_json = self._search_regex(
2014 ASSETS_RE, embed_webpage, 'JS player URL')
2015
2016 player_url = json.loads(jsplayer_url_json)
2017 if player_url is None:
2018 player_url_json = self._search_regex(
2019 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2020 video_webpage, 'age gate player URL')
2021 player_url = json.loads(player_url_json)
2022
2023 if 'sig' in url_data:
2024 url += '&signature=' + url_data['sig'][0]
2025 elif 's' in url_data:
2026 encrypted_sig = url_data['s'][0]
2027
2028 if self._downloader.params.get('verbose'):
2029 if player_url is None:
2030 player_desc = 'unknown'
2031 else:
2032 player_type, player_version = self._extract_player_info(player_url)
2033 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2034 parts_sizes = self._signature_cache_id(encrypted_sig)
2035 self.to_screen('{%s} signature length %s, %s' %
2036 (format_id, parts_sizes, player_desc))
2037
2038 signature = self._decrypt_signature(
2039 encrypted_sig, video_id, player_url, age_gate)
2040 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2041 url += '&%s=%s' % (sp, signature)
2042 if 'ratebypass' not in url:
2043 url += '&ratebypass=yes'
2044
2045 dct = {
2046 'format_id': format_id,
2047 'url': url,
2048 'player_url': player_url,
2049 }
2050 if format_id in self._formats:
2051 dct.update(self._formats[format_id])
2052 if format_id in formats_spec:
2053 dct.update(formats_spec[format_id])
2054
2055 # Some itags are not included in DASH manifest thus corresponding formats will
2056 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2057 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2058 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2059 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2060
2061 if width is None:
2062 width = int_or_none(fmt.get('width'))
2063 if height is None:
2064 height = int_or_none(fmt.get('height'))
2065
2066 filesize = int_or_none(url_data.get(
2067 'clen', [None])[0]) or _extract_filesize(url)
2068
2069 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2070 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2071
2072 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2073 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2074 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2075
2076 more_fields = {
2077 'filesize': filesize,
2078 'tbr': tbr,
2079 'width': width,
2080 'height': height,
2081 'fps': fps,
2082 'format_note': quality_label or quality,
2083 }
2084 for key, value in more_fields.items():
2085 if value:
2086 dct[key] = value
2087 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2088 if type_:
2089 type_split = type_.split(';')
2090 kind_ext = type_split[0].split('/')
2091 if len(kind_ext) == 2:
2092 kind, _ = kind_ext
2093 dct['ext'] = mimetype2ext(type_split[0])
2094 if kind in ('audio', 'video'):
2095 codecs = None
2096 for mobj in re.finditer(
2097 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2098 if mobj.group('key') == 'codecs':
2099 codecs = mobj.group('val')
2100 break
2101 if codecs:
2102 dct.update(parse_codecs(codecs))
2103 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2104 dct['downloader_options'] = {
2105 # Youtube throttles chunks >~10M
2106 'http_chunk_size': 10485760,
2107 }
2108 formats.append(dct)
2109 else:
2110 manifest_url = (
2111 url_or_none(try_get(
2112 player_response,
2113 lambda x: x['streamingData']['hlsManifestUrl'],
2114 compat_str))
2115 or url_or_none(try_get(
2116 video_info, lambda x: x['hlsvp'][0], compat_str)))
2117 if manifest_url:
2118 formats = []
2119 m3u8_formats = self._extract_m3u8_formats(
2120 manifest_url, video_id, 'mp4', fatal=False)
2121 for a_format in m3u8_formats:
2122 itag = self._search_regex(
2123 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2124 if itag:
2125 a_format['format_id'] = itag
2126 if itag in self._formats:
2127 dct = self._formats[itag].copy()
2128 dct.update(a_format)
2129 a_format = dct
2130 a_format['player_url'] = player_url
2131 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2132 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2133 if self._downloader.params.get('youtube_include_hls_manifest', True):
2134 formats.append(a_format)
2135 else:
2136 error_message = extract_unavailable_message()
2137 if not error_message:
2138 error_message = clean_html(try_get(
2139 player_response, lambda x: x['playabilityStatus']['reason'],
2140 compat_str))
2141 if not error_message:
2142 error_message = clean_html(
2143 try_get(video_info, lambda x: x['reason'][0], compat_str))
2144 if error_message:
2145 raise ExtractorError(error_message, expected=True)
2146 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2147
2148 # uploader
2149 video_uploader = try_get(
2150 video_info, lambda x: x['author'][0],
2151 compat_str) or str_or_none(video_details.get('author'))
2152 if video_uploader:
2153 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2154 else:
2155 self._downloader.report_warning('unable to extract uploader name')
2156
2157 # uploader_id
2158 video_uploader_id = None
2159 video_uploader_url = None
2160 mobj = re.search(
2161 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2162 video_webpage)
2163 if mobj is not None:
2164 video_uploader_id = mobj.group('uploader_id')
2165 video_uploader_url = mobj.group('uploader_url')
2166 else:
2167 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2168 if owner_profile_url:
2169 video_uploader_id = self._search_regex(
2170 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2171 default=None)
2172 video_uploader_url = owner_profile_url
2173
2174 channel_id = (
2175 str_or_none(video_details.get('channelId'))
2176 or self._html_search_meta(
2177 'channelId', video_webpage, 'channel id', default=None)
2178 or self._search_regex(
2179 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2180 video_webpage, 'channel id', default=None, group='id'))
2181 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2182
2183 thumbnails = []
2184 thumbnails_list = try_get(
2185 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2186 for t in thumbnails_list:
2187 if not isinstance(t, dict):
2188 continue
2189 thumbnail_url = url_or_none(t.get('url'))
2190 if not thumbnail_url:
2191 continue
2192 thumbnails.append({
2193 'url': thumbnail_url,
2194 'width': int_or_none(t.get('width')),
2195 'height': int_or_none(t.get('height')),
2196 })
2197
2198 if not thumbnails:
2199 video_thumbnail = None
2200 # We try first to get a high quality image:
2201 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2202 video_webpage, re.DOTALL)
2203 if m_thumb is not None:
2204 video_thumbnail = m_thumb.group(1)
2205 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2206 if thumbnail_url:
2207 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2208 if video_thumbnail:
2209 thumbnails.append({'url': video_thumbnail})
2210
2211 # upload date
2212 upload_date = self._html_search_meta(
2213 'datePublished', video_webpage, 'upload date', default=None)
2214 if not upload_date:
2215 upload_date = self._search_regex(
2216 [r'(?s)id="eow-date.*?>(.*?)</span>',
2217 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2218 video_webpage, 'upload date', default=None)
2219 if not upload_date:
2220 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2221 upload_date = unified_strdate(upload_date)
2222
2223 video_license = self._html_search_regex(
2224 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2225 video_webpage, 'license', default=None)
2226
2227 m_music = re.search(
2228 r'''(?x)
2229 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2230 <ul[^>]*>\s*
2231 <li>(?P<title>.+?)
2232 by (?P<creator>.+?)
2233 (?:
2234 \(.+?\)|
2235 <a[^>]*
2236 (?:
2237 \bhref=["\']/red[^>]*>| # drop possible
2238 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2239 )
2240 .*?
2241 )?</li
2242 ''',
2243 video_webpage)
2244 if m_music:
2245 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2246 video_creator = clean_html(m_music.group('creator'))
2247 else:
2248 video_alt_title = video_creator = None
2249
2250 def extract_meta(field):
2251 return self._html_search_regex(
2252 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2253 video_webpage, field, default=None)
2254
2255 track = extract_meta('Song')
2256 artist = extract_meta('Artist')
2257 album = extract_meta('Album')
2258
2259 # Youtube Music Auto-generated description
2260 release_date = release_year = None
2261 if video_description:
2262 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2263 if mobj:
2264 if not track:
2265 track = mobj.group('track').strip()
2266 if not artist:
2267 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2268 if not album:
2269 album = mobj.group('album'.strip())
2270 release_year = mobj.group('release_year')
2271 release_date = mobj.group('release_date')
2272 if release_date:
2273 release_date = release_date.replace('-', '')
2274 if not release_year:
2275 release_year = int(release_date[:4])
2276 if release_year:
2277 release_year = int(release_year)
2278
2279 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2280 if yt_initial:
2281 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2282 if len(music_metadata):
2283 album = music_metadata[0].get('album')
2284 artist = music_metadata[0].get('artist')
2285 track = music_metadata[0].get('track')
2286
2287 m_episode = re.search(
2288 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2289 video_webpage)
2290 if m_episode:
2291 series = unescapeHTML(m_episode.group('series'))
2292 season_number = int(m_episode.group('season'))
2293 episode_number = int(m_episode.group('episode'))
2294 else:
2295 series = season_number = episode_number = None
2296
2297 m_cat_container = self._search_regex(
2298 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2299 video_webpage, 'categories', default=None)
2300 category = None
2301 if m_cat_container:
2302 category = self._html_search_regex(
2303 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2304 default=None)
2305 if not category:
2306 category = try_get(
2307 microformat, lambda x: x['category'], compat_str)
2308 video_categories = None if category is None else [category]
2309
2310 video_tags = [
2311 unescapeHTML(m.group('content'))
2312 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2313 if not video_tags:
2314 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2315
2316 def _extract_count(count_name):
2317 return str_to_int(self._search_regex(
2318 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2319 % re.escape(count_name),
2320 video_webpage, count_name, default=None))
2321
2322 like_count = _extract_count('like')
2323 dislike_count = _extract_count('dislike')
2324
2325 if view_count is None:
2326 view_count = str_to_int(self._search_regex(
2327 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2328 'view count', default=None))
2329
2330 average_rating = (
2331 float_or_none(video_details.get('averageRating'))
2332 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2333
2334 # subtitles
2335 video_subtitles = self.extract_subtitles(
2336 video_id, video_webpage, has_live_chat_replay)
2337 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2338
2339 video_duration = try_get(
2340 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2341 if not video_duration:
2342 video_duration = int_or_none(video_details.get('lengthSeconds'))
2343 if not video_duration:
2344 video_duration = parse_duration(self._html_search_meta(
2345 'duration', video_webpage, 'video duration'))
2346
2347 # Get Subscriber Count of channel
2348 subscriber_count = parse_count(self._search_regex(
2349 r'"text":"([\d\.]+\w?) subscribers"',
2350 video_webpage,
2351 'subscriber count',
2352 default=None
2353 ))
2354
2355 # annotations
2356 video_annotations = None
2357 if self._downloader.params.get('writeannotations', False):
2358 xsrf_token = self._search_regex(
2359 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2360 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2361 invideo_url = try_get(
2362 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2363 if xsrf_token and invideo_url:
2364 xsrf_field_name = self._search_regex(
2365 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2366 video_webpage, 'xsrf field name',
2367 group='xsrf_field_name', default='session_token')
2368 video_annotations = self._download_webpage(
2369 self._proto_relative_url(invideo_url),
2370 video_id, note='Downloading annotations',
2371 errnote='Unable to download video annotations', fatal=False,
2372 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2373
2374 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2375
2376 # Look for the DASH manifest
2377 if self._downloader.params.get('youtube_include_dash_manifest', True):
2378 dash_mpd_fatal = True
2379 for mpd_url in dash_mpds:
2380 dash_formats = {}
2381 try:
2382 def decrypt_sig(mobj):
2383 s = mobj.group(1)
2384 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2385 return '/signature/%s' % dec_s
2386
2387 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2388
2389 for df in self._extract_mpd_formats(
2390 mpd_url, video_id, fatal=dash_mpd_fatal,
2391 formats_dict=self._formats):
2392 if not df.get('filesize'):
2393 df['filesize'] = _extract_filesize(df['url'])
2394 # Do not overwrite DASH format found in some previous DASH manifest
2395 if df['format_id'] not in dash_formats:
2396 dash_formats[df['format_id']] = df
2397 # Additional DASH manifests may end up in HTTP Error 403 therefore
2398 # allow them to fail without bug report message if we already have
2399 # some DASH manifest succeeded. This is temporary workaround to reduce
2400 # burst of bug reports until we figure out the reason and whether it
2401 # can be fixed at all.
2402 dash_mpd_fatal = False
2403 except (ExtractorError, KeyError) as e:
2404 self.report_warning(
2405 'Skipping DASH manifest: %r' % e, video_id)
2406 if dash_formats:
2407 # Remove the formats we found through non-DASH, they
2408 # contain less info and it can be wrong, because we use
2409 # fixed values (for example the resolution). See
2410 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2411 # example.
2412 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2413 formats.extend(dash_formats.values())
2414
2415 # Check for malformed aspect ratio
2416 stretched_m = re.search(
2417 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2418 video_webpage)
2419 if stretched_m:
2420 w = float(stretched_m.group('w'))
2421 h = float(stretched_m.group('h'))
2422 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2423 # We will only process correct ratios.
2424 if w > 0 and h > 0:
2425 ratio = w / h
2426 for f in formats:
2427 if f.get('vcodec') != 'none':
2428 f['stretched_ratio'] = ratio
2429
2430 if not formats:
2431 if 'reason' in video_info:
2432 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2433 regions_allowed = self._html_search_meta(
2434 'regionsAllowed', video_webpage, default=None)
2435 countries = regions_allowed.split(',') if regions_allowed else None
2436 self.raise_geo_restricted(
2437 msg=video_info['reason'][0], countries=countries)
2438 reason = video_info['reason'][0]
2439 if 'Invalid parameters' in reason:
2440 unavailable_message = extract_unavailable_message()
2441 if unavailable_message:
2442 reason = unavailable_message
2443 raise ExtractorError(
2444 'YouTube said: %s' % reason,
2445 expected=True, video_id=video_id)
2446 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2447 raise ExtractorError('This video is DRM protected.', expected=True)
2448
2449 self._sort_formats(formats)
2450
2451 self.mark_watched(video_id, video_info, player_response)
2452
2453 return {
2454 'id': video_id,
2455 'uploader': video_uploader,
2456 'uploader_id': video_uploader_id,
2457 'uploader_url': video_uploader_url,
2458 'channel_id': channel_id,
2459 'channel_url': channel_url,
2460 'upload_date': upload_date,
2461 'license': video_license,
2462 'creator': video_creator or artist,
2463 'title': video_title,
2464 'alt_title': video_alt_title or track,
2465 'thumbnails': thumbnails,
2466 'description': video_description,
2467 'categories': video_categories,
2468 'tags': video_tags,
2469 'subtitles': video_subtitles,
2470 'automatic_captions': automatic_captions,
2471 'duration': video_duration,
2472 'age_limit': 18 if age_gate else 0,
2473 'annotations': video_annotations,
2474 'chapters': chapters,
2475 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2476 'view_count': view_count,
2477 'like_count': like_count,
2478 'dislike_count': dislike_count,
2479 'average_rating': average_rating,
2480 'formats': formats,
2481 'is_live': is_live,
2482 'start_time': start_time,
2483 'end_time': end_time,
2484 'series': series,
2485 'season_number': season_number,
2486 'episode_number': episode_number,
2487 'track': track,
2488 'artist': artist,
2489 'album': album,
2490 'release_date': release_date,
2491 'release_year': release_year,
2492 'subscriber_count': subscriber_count,
2493 }
2494
2495
2496 class YoutubeTabIE(YoutubeBaseInfoExtractor):
2497 IE_DESC = 'YouTube.com tab'
2498 _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
2499 IE_NAME = 'youtube:tab'
2500
2501 _TESTS = [{
2502 # playlists, multipage
2503 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2504 'playlist_mincount': 94,
2505 'info_dict': {
2506 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2507 'title': 'Игорь Клейнер - Playlists',
2508 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2509 },
2510 }, {
2511 # playlists, multipage, different order
2512 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2513 'playlist_mincount': 94,
2514 'info_dict': {
2515 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2516 'title': 'Игорь Клейнер - Playlists',
2517 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2518 },
2519 }, {
2520 # playlists, singlepage
2521 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2522 'playlist_mincount': 4,
2523 'info_dict': {
2524 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2525 'title': 'ThirstForScience - Playlists',
2526 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2527 }
2528 }, {
2529 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2530 'only_matching': True,
2531 }, {
2532 # basic, single video playlist
2533 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2534 'info_dict': {
2535 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2536 'uploader': 'Sergey M.',
2537 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2538 'title': 'youtube-dl public playlist',
2539 },
2540 'playlist_count': 1,
2541 }, {
2542 # empty playlist
2543 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2544 'info_dict': {
2545 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2546 'uploader': 'Sergey M.',
2547 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2548 'title': 'youtube-dl empty playlist',
2549 },
2550 'playlist_count': 0,
2551 }, {
2552 # Home tab
2553 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2554 'info_dict': {
2555 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2556 'title': 'lex will - Home',
2557 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2558 },
2559 'playlist_mincount': 2,
2560 }, {
2561 # Videos tab
2562 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2563 'info_dict': {
2564 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2565 'title': 'lex will - Videos',
2566 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2567 },
2568 'playlist_mincount': 975,
2569 }, {
2570 # Videos tab, sorted by popular
2571 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2572 'info_dict': {
2573 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2574 'title': 'lex will - Videos',
2575 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2576 },
2577 'playlist_mincount': 199,
2578 }, {
2579 # Playlists tab
2580 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2581 'info_dict': {
2582 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2583 'title': 'lex will - Playlists',
2584 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2585 },
2586 'playlist_mincount': 17,
2587 }, {
2588 # Community tab
2589 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2590 'info_dict': {
2591 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2592 'title': 'lex will - Community',
2593 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2594 },
2595 'playlist_mincount': 18,
2596 }, {
2597 # Channels tab
2598 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2599 'info_dict': {
2600 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2601 'title': 'lex will - Channels',
2602 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2603 },
2604 'playlist_mincount': 138,
2605 }, {
2606 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2607 'only_matching': True,
2608 }, {
2609 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2610 'only_matching': True,
2611 }, {
2612 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ',
2613 'only_matching': True,
2614 }, {
2615 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2616 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2617 'info_dict': {
2618 'title': '29C3: Not my department',
2619 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2620 'uploader': 'Christiaan008',
2621 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2622 },
2623 'playlist_count': 96,
2624 }, {
2625 'note': 'Large playlist',
2626 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2627 'info_dict': {
2628 'title': 'Uploads from Cauchemar',
2629 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2630 'uploader': 'Cauchemar',
2631 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2632 },
2633 'playlist_mincount': 1123,
2634 }, {
2635 # even larger playlist, 8832 videos
2636 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2637 'only_matching': True,
2638 }, {
2639 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2640 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2641 'info_dict': {
2642 'title': 'Uploads from Interstellar Movie',
2643 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2644 'uploader': 'Interstellar Movie',
2645 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2646 },
2647 'playlist_mincount': 21,
2648 }, {
2649 # https://github.com/ytdl-org/youtube-dl/issues/21844
2650 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2651 'info_dict': {
2652 'title': 'Data Analysis with Dr Mike Pound',
2653 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2654 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2655 'uploader': 'Computerphile',
2656 },
2657 'playlist_mincount': 11,
2658 }, {
2659 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2660 'only_matching': True,
2661 }, {
2662 # Playlist URL that does not actually serve a playlist
2663 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2664 'info_dict': {
2665 'id': 'FqZTN594JQw',
2666 'ext': 'webm',
2667 'title': "Smiley's People 01 detective, Adventure Series, Action",
2668 'uploader': 'STREEM',
2669 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2670 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2671 'upload_date': '20150526',
2672 'license': 'Standard YouTube License',
2673 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2674 'categories': ['People & Blogs'],
2675 'tags': list,
2676 'view_count': int,
2677 'like_count': int,
2678 'dislike_count': int,
2679 },
2680 'params': {
2681 'skip_download': True,
2682 },
2683 'skip': 'This video is not available.',
2684 'add_ie': [YoutubeIE.ie_key()],
2685 }, {
2686 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2687 'only_matching': True,
2688 }, {
2689 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2690 'only_matching': True,
2691 }]
2692
2693 @classmethod
2694 def suitable(cls, url):
2695 return False if YoutubeLiveIE.suitable(url) else super(
2696 YoutubeTabIE, cls).suitable(url)
2697
2698 def _extract_channel_id(self, webpage):
2699 channel_id = self._html_search_meta(
2700 'channelId', webpage, 'channel id', default=None)
2701 if channel_id:
2702 return channel_id
2703 channel_url = self._html_search_meta(
2704 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2705 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2706 'twitter:app:url:googleplay'), webpage, 'channel url')
2707 return self._search_regex(
2708 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2709 channel_url, 'channel id')
2710
2711 @staticmethod
2712 def _extract_grid_item_renderer(item):
2713 for item_kind in ('Playlist', 'Video', 'Channel'):
2714 renderer = item.get('grid%sRenderer' % item_kind)
2715 if renderer:
2716 return renderer
2717
2718 def _extract_video(self, renderer):
2719 video_id = renderer.get('videoId')
2720 title = try_get(
2721 renderer,
2722 (lambda x: x['title']['runs'][0]['text'],
2723 lambda x: x['title']['simpleText']), compat_str)
2724 description = try_get(
2725 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2726 compat_str)
2727 duration = parse_duration(try_get(
2728 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2729 view_count_text = try_get(
2730 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2731 view_count = str_to_int(self._search_regex(
2732 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2733 'view count', default=None))
2734 uploader = try_get(
2735 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2736 return {
2737 '_type': 'url_transparent',
2738 'ie_key': YoutubeIE.ie_key(),
2739 'id': video_id,
2740 'url': video_id,
2741 'title': title,
2742 'description': description,
2743 'duration': duration,
2744 'view_count': view_count,
2745 'uploader': uploader,
2746 }
2747
    def _grid_entries(self, grid_renderer):
        # Yield playlist, video and channel entries from a gridRenderer's
        # items. An item may match more than one of the id checks below, in
        # which case multiple entries are yielded for it.
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_grid_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            title = try_get(
                renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                # channel titles use 'simpleText' rather than 'runs'
                title = try_get(
                    renderer, lambda x: x['title']['simpleText'], compat_str)
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
2777 def _shelf_entries_trimmed(self, shelf_renderer):
2778 renderer = try_get(
2779 shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
2780 if not renderer:
2781 return
2782 # TODO: add support for nested playlists so each shelf is processed
2783 # as separate playlist
2784 # TODO: this includes only first N items
2785 for entry in self._grid_entries(renderer):
2786 yield entry
2787
2788 def _shelf_entries(self, shelf_renderer):
2789 ep = try_get(
2790 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2791 compat_str)
2792 shelf_url = urljoin('https://www.youtube.com', ep)
2793 if not shelf_url:
2794 return
2795 title = try_get(
2796 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2797 yield self.url_result(shelf_url, video_title=title)
2798
2799 def _playlist_entries(self, video_list_renderer):
2800 for content in video_list_renderer['contents']:
2801 if not isinstance(content, dict):
2802 continue
2803 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2804 if not isinstance(renderer, dict):
2805 continue
2806 video_id = renderer.get('videoId')
2807 if not video_id:
2808 continue
2809 yield self._extract_video(renderer)
2810
2811 def _video_entry(self, video_renderer):
2812 video_id = video_renderer.get('videoId')
2813 if video_id:
2814 return self._extract_video(video_renderer)
2815
2816 def _post_thread_entries(self, post_thread_renderer):
2817 post_renderer = try_get(
2818 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
2819 if not post_renderer:
2820 return
2821 # video attachment
2822 video_renderer = try_get(
2823 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
2824 video_id = None
2825 if video_renderer:
2826 entry = self._video_entry(video_renderer)
2827 if entry:
2828 yield entry
2829 # inline video links
2830 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
2831 for run in runs:
2832 if not isinstance(run, dict):
2833 continue
2834 ep_url = try_get(
2835 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
2836 if not ep_url:
2837 continue
2838 if not YoutubeIE.suitable(ep_url):
2839 continue
2840 ep_video_id = YoutubeIE._match_id(ep_url)
2841 if video_id == ep_video_id:
2842 continue
2843 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
2844
2845 def _post_thread_continuation_entries(self, post_thread_continuation):
2846 contents = post_thread_continuation.get('contents')
2847 if not isinstance(contents, list):
2848 return
2849 for content in contents:
2850 renderer = content.get('backstagePostThreadRenderer')
2851 if not isinstance(renderer, dict):
2852 continue
2853 for entry in self._post_thread_entries(renderer):
2854 yield entry
2855
2856 @staticmethod
2857 def _extract_next_continuation_data(renderer):
2858 next_continuation = try_get(
2859 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
2860 if not next_continuation:
2861 return
2862 continuation = next_continuation.get('continuation')
2863 if not continuation:
2864 return
2865 ctp = next_continuation.get('clickTrackingParams')
2866 return {
2867 'ctoken': continuation,
2868 'continuation': continuation,
2869 'itct': ctp,
2870 }
2871
    @classmethod
    def _extract_continuation(cls, renderer):
        # Prefer the legacy nextContinuationData format; otherwise scan the
        # renderer's contents for a continuationItemRenderer endpoint.
        next_continuation = cls._extract_next_continuation_data(renderer)
        if next_continuation:
            return next_continuation
        contents = renderer.get('contents')
        if not isinstance(contents, list):
            return
        for content in contents:
            if not isinstance(content, dict):
                continue
            continuation_ep = try_get(
                content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
                dict)
            if not continuation_ep:
                continue
            continuation = try_get(
                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
            if not continuation:
                continue
            ctp = continuation_ep.get('clickTrackingParams')
            if not ctp:
                continue
            # Same shape as _extract_next_continuation_data's return value.
            return {
                'ctoken': continuation,
                'continuation': continuation,
                'itct': ctp,
            }
2900
    def _entries(self, tab, identity_token):
        # Yield all entries of a tab's sectionListRenderer, then follow
        # continuation pages via the browse_ajax endpoint.  identity_token,
        # when present, is sent as x-youtube-identity-token so that
        # account-specific content is visible.
        continuation = None
        slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
        for slr_content in slr_contents:
            if not isinstance(slr_content, dict):
                continue
            is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue
                renderer = isr_content.get('playlistVideoListRenderer')
                if renderer:
                    for entry in self._playlist_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('gridRenderer')
                if renderer:
                    for entry in self._grid_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('shelfRenderer')
                if renderer:
                    for entry in self._shelf_entries(renderer):
                        yield entry
                    continue
                renderer = isr_content.get('backstagePostThreadRenderer')
                if renderer:
                    for entry in self._post_thread_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('videoRenderer')
                if renderer:
                    entry = self._video_entry(renderer)
                    if entry:
                        yield entry

            # fall back to a section-level continuation
            if not continuation:
                continuation = self._extract_continuation(is_renderer)

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            browse = self._download_json(
                'https://www.youtube.com/browse_ajax', None,
                'Downloading page %d' % page_num,
                headers=headers, query=continuation, fatal=False)
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # old-style continuation payload
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue

            # new-style continuation payload
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    # NOTE(review): `continuation` is left unchanged on this
                    # path, so the same page would be requested again —
                    # confirm whether this case can occur in practice.
                    continue
                renderer = continuation_item.get('playlistVideoRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue

            break
3003
3004 @staticmethod
3005 def _extract_selected_tab(tabs):
3006 for tab in tabs:
3007 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3008 return tab['tabRenderer']
3009 else:
3010 raise ExtractorError('Unable to find selected tab')
3011
    @staticmethod
    def _extract_uploader(data):
        # Pull uploader name/id/url out of the playlist sidebar's secondary
        # info renderer; returns a (possibly empty) dict of those fields.
        uploader = {}
        sidebar_renderer = try_get(
            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
        if sidebar_renderer:
            for item in sidebar_renderer:
                if not isinstance(item, dict):
                    continue
                renderer = item.get('playlistSidebarSecondaryInfoRenderer')
                if not isinstance(renderer, dict):
                    continue
                owner = try_get(
                    renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
                if owner:
                    uploader['uploader'] = owner.get('text')
                    uploader['uploader_id'] = try_get(
                        owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
                    uploader['uploader_url'] = urljoin(
                        'https://www.youtube.com/',
                        try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
        return uploader
3034
    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        # Build a playlist result for the selected tab of a channel/playlist
        # page.  Channel metadata supplies title/description/id first; the
        # playlist metadata renderer, when present, overrides them.
        # NOTE(review): if neither metadata renderer is present, title,
        # description and playlist_id are referenced unbound below — confirm
        # whether that state is reachable.
        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        if renderer:
            channel_title = renderer.get('title') or item_id
            tab_title = selected_tab.get('title')
            title = channel_title or item_id
            if tab_title:
                # e.g. 'lex will - Videos'
                title += ' - %s' % tab_title
            description = renderer.get('description')
            playlist_id = renderer.get('externalId')
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
        if renderer:
            title = renderer.get('title')
            description = None
            playlist_id = item_id
        playlist = self.playlist_result(
            self._entries(selected_tab['content'], identity_token),
            playlist_id=playlist_id, playlist_title=title,
            playlist_description=description)
        # merge uploader fields extracted from the sidebar
        playlist.update(self._extract_uploader(data))
        return playlist
3059
3060 def _extract_from_playlist(self, item_id, data, playlist):
3061 title = playlist.get('title') or try_get(
3062 data, lambda x: x['titleText']['simpleText'], compat_str)
3063 playlist_id = playlist.get('playlistId') or item_id
3064 return self.playlist_result(
3065 self._playlist_entries(playlist), playlist_id=playlist_id,
3066 playlist_title=title)
3067
    def _real_extract(self, url):
        # Dispatch a tab/playlist/watch URL to the right extraction path.
        item_id = self._match_id(url)
        # normalize host (music.youtube.com, youtubekids.com, ...) to www
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        # channel/playlist pages expose a two-column browse layout with tabs
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        # watch pages with a playlist panel
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3099
3100
class YoutubePlaylistIE(InfoExtractor):
    """Thin wrapper that redirects playlist URLs/ids to YoutubeTabIE."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Let YoutubeTabIE handle full playlist URLs; this IE only takes
        # what is left (bare ids and youtu.be-style links).
        return False if YoutubeTabIE.suitable(url) else super(
            YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not qs:
            # bare playlist id (no query string) — synthesize the list param
            qs = {'list': playlist_id}
        return self.url_result(
            update_url_query('https://www.youtube.com/playlist', qs),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3199
3200
class YoutubeYtUserIE(InfoExtractor):
    """Resolve ytuser:<name> pseudo-URLs to the corresponding user page."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3213
3214
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a channel's /live URL to its current live stream video."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            # a currently-live channel exposes an og:type of 'video...' and a
            # valid 11-character video id in its meta tags
            if page_type.startswith('video') and video_id and re.match(
                    r'^[0-9A-Za-z_-]{11}$', video_id):
                return self.url_result(video_id, YoutubeIE.ie_key())
        # not live (or page unavailable): fall back to the channel page
        return self.url_result(base_url)
3265
3266
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backed by the youtubei/v1/search JSON API."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # optional base64 filter blob appended to the query (see subclasses)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        # Yield up to n video entries for `query`, following continuation
        # tokens across result pages.
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # first page and continuation pages use different envelopes
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # continuation token lives in the second section-list item
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3355
3356
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search extractor, with the upload-date sort filter applied."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded base64 blob ('CAI=') selecting the sort-by-date filter
    _SEARCH_PARAMS = 'CAI%3D'
3362
3363
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # feeds are account-specific, so credentials are mandatory
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        # derived from the subclass's feed name, e.g. 'youtube:watchlater'
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
            if not new_ids:
                break

            ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                headers=self._YOUTUBE_CLIENT_HEADERS)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3416
3417
class YoutubeWatchLaterIE(InfoExtractor):
    """Resolve the Watch Later feed to the reserved 'WL' playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch Later is just the reserved playlist id 'WL'; delegate to the
        # tab extractor, which handles the actual (authenticated) extraction.
        # (Dead fallback code that followed this return has been removed --
        # it was unreachable.)
        return self.url_result(
            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
3439
3440
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/recommended; all logic lives in the base class.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3446
3447
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/subscriptions; all logic lives in the base class.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3453
3454
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for /feed/history; all logic lives in the base class.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3460
3461
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lack a video id and raise a helpful
    error instead of a generic 'unsupported URL' failure."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The usual cause is an unquoted URL whose '&v=...' part was eaten by
        # the shell; explain that instead of failing cryptically.
        # (Fixed the stray double space the message used to contain.)
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
3509
3510
class YoutubeTruncatedIDIE(InfoExtractor):
    # Matches watch URLs whose video id is shorter than the canonical 11
    # characters, so the user gets a clear "truncated id" error instead of a
    # confusing extraction failure.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extracts anything -- the id is known to be incomplete.
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)
3526
3527
# Old extractors. TODO: verify these cases are handled by the newer extractors before removing.
3529
class YoutubeSearchURLIE(YoutubeSearchIE):
    """Extractor for search result-page URLs (youtube.com/results?...)."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _process_json_dict(self, obj, videos, c):
        # Collect renderer dicts that describe a video; when a dict instead
        # carries continuation data, remember its token for the next page.
        if "videoId" in obj:
            videos.append(obj)
        elif "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        search_query = compat_urllib_parse_unquote_plus(match.group('query'))
        webpage = self._download_webpage(url, search_query)
        entries = self._entries(webpage, search_query, max_pages=5)
        return self.playlist_result(entries, playlist_title=search_query)
3559
3560
class YoutubeShowIE(InfoExtractor):
    # Rewrites /show/<id> URLs to the show's /playlists page and delegates
    # extraction to the parent class.
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # NOTE(review): the direct base class here is InfoExtractor, whose
        # _real_extract is abstract, so this super() call looks like it would
        # raise NotImplementedError -- confirm the intended base class
        # (upstream youtube-dl used a playlists base extractor here).
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
3578
3579
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites feed to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the playlist that backs it;
        # scrape that id and hand off to the playlist extractor.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')