]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dlc/extractor/youtube.py
Implemented all Youtube Feeds (ytfav, ytwatchlater, ytsubs, ythistory, ytrec) and...
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_kwargs,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27)
28from ..utils import (
29 bool_or_none,
30 clean_html,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 parse_codecs,
38 parse_count,
39 parse_duration,
40 remove_quotes,
41 remove_start,
42 smuggle_url,
43 str_or_none,
44 str_to_int,
45 try_get,
46 unescapeHTML,
47 unified_strdate,
48 unsmuggle_url,
49 update_url_query,
50 uppercase_escape,
51 url_or_none,
52 urlencode_postdata,
53 urljoin,
54)
55
56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in pages (HTML).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # JSON endpoints of the web sign-in flow: account lookup, password
    # challenge, and two-factor challenge ('{0}' is filled with the "TL"
    # token extracted from the password-challenge response).
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex fragment of youtube.com path components that are site features,
    # not channel/user names; intended for use inside other URL patterns.
    _RESERVED_NAMES = (
        r'course|embed|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Regex fragment matching known playlist ID shapes: a two-letter (or
    # 'OLAK5uy_') prefix followed by >=10 ID chars, plus the special
    # mix/liked/watch-later pseudo-IDs 'RDMM', 'LL' and 'WL'.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)'

    # Headers identifying the desktop web client to YouTube endpoints.
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie to force an English-language UI, so that
        regex-based scraping sees stable English page text."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Map an iterable of video IDs to url_result dicts dispatched to
        the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.

        NOTE(review): the request/response payloads below are Google's
        undocumented nested-JSON arrays; the positional indices used with
        try_get() are reverse engineered and may break without notice.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            # NOTE(review): 'and False' deliberately disables this reminder
            # for now (see inline TODO) — the branch is dead code.
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        # _download_webpage returns False (not None) on a non-fatal failure
        if login_page is False:
            return

        # Hidden <input> fields carry CSRF/session tokens that must be
        # echoed back on every subsequent sign-in request.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow; f_req is the nested-array
            # payload Google expects in the 'f.req' form field.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses are prefixed with an anti-XSSI garbage string;
                # strip everything before the first '[' to get valid JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account for this username.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque account identifier echoed back in later requests.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        # NOTE(review): returns None here rather than False as the docstring
        # promises; callers treating any falsy value as failure still work.
        if challenge_results is False:
            return

        # A populated entry at [0][5] signals a login error (e.g. bad password).
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # A nested challenge entry means additional verification is required.
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token required by the TFA endpoint URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users sometimes paste codes with the SMS 'G-' prefix.
                tfa_code = remove_start(tfa_code, 'G-')

                # Step 3 (conditional): submit the TFA code.
                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-slot convention as the password challenge above.
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges that cannot be solved non-interactively; map the
                # known codes to human-readable explanations.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Final step: fetching this URL materializes the session cookies.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A logged-in session is redirected through myaccount.google.com.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Thin wrapper over InfoExtractor._download_webpage_handle.

        Copies the 'query' dict before passing it on so the caller's dict is
        never mutated downstream.
        """
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON blob from a watch page.

        Non-fatal variant: returns None when the blob is absent or does not
        parse. NOTE(review): overlaps with _extract_yt_initial_data below,
        which is the fatal variant — candidates for consolidation.
        """
        config = self._search_regex(
            # (?<=}) anchors the lazy match so it ends at a closing brace,
            # keeping '.*?' from stopping at a '};' inside a string.
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        # NOTE(review): the guard below is a no-op — nothing follows the
        # conditional return, so the _login() result is effectively ignored.
        if not self._login():
            return

    # Minimal InnerTube request context ('WEB' desktop client); _call_api
    # merges caller-supplied fields on top of a copy of this.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    def _call_api(self, ep, query, video_id):
        """POST to the youtubei/v1/<ep> InnerTube endpoint and return the
        decoded JSON response.

        ep       -- endpoint name appended to the base URL (e.g. 'player')
        query    -- dict merged into _DEFAULT_API_DATA as the JSON body
        video_id -- video ID used for logging/error reporting only
        """
        # copy() is shallow; update() only replaces top-level keys, so the
        # shared nested 'context' dict is read but not mutated here.
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            # Public web-client API key, same for all users.
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse ytInitialData from a page; raises (fatal) when
        the blob cannot be found or parsed, unlike _get_yt_initial_data."""
        return self._parse_json(
            self._search_regex(
                r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
                webpage, 'yt initial data'),
            video_id)
328
329
330class YoutubeIE(YoutubeBaseInfoExtractor):
331 IE_DESC = 'YouTube.com'
332 _VALID_URL = r"""(?x)^
333 (
334 (?:https?://|//) # http(s):// or protocol-independent URL
335 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
336 (?:www\.)?deturl\.com/www\.youtube\.com/|
337 (?:www\.)?pwnyoutube\.com/|
338 (?:www\.)?hooktube\.com/|
339 (?:www\.)?yourepeat\.com/|
340 tube\.majestyc\.net/|
341 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
342 (?:(?:www|dev)\.)?invidio\.us/|
343 (?:(?:www|no)\.)?invidiou\.sh/|
344 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
345 (?:www\.)?invidious\.kabi\.tk/|
346 (?:www\.)?invidious\.13ad\.de/|
347 (?:www\.)?invidious\.mastodon\.host/|
348 (?:www\.)?invidious\.nixnet\.xyz/|
349 (?:www\.)?invidious\.drycat\.fr/|
350 (?:www\.)?tube\.poal\.co/|
351 (?:www\.)?vid\.wxzm\.sx/|
352 (?:www\.)?yewtu\.be/|
353 (?:www\.)?yt\.elukerio\.org/|
354 (?:www\.)?yt\.lelux\.fi/|
355 (?:www\.)?invidious\.ggc-project\.de/|
356 (?:www\.)?yt\.maisputain\.ovh/|
357 (?:www\.)?invidious\.13ad\.de/|
358 (?:www\.)?invidious\.toot\.koeln/|
359 (?:www\.)?invidious\.fdn\.fr/|
360 (?:www\.)?watch\.nettohikari\.com/|
361 (?:www\.)?kgg2m7yk5aybusll\.onion/|
362 (?:www\.)?qklhadlycap4cnod\.onion/|
363 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
364 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
365 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
366 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
367 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
368 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
369 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
370 (?:.*?\#/)? # handle anchor (#/) redirect urls
371 (?: # the various things that can precede the ID:
372 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
373 |(?: # or the v= param in all its forms
374 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
375 (?:\?|\#!?) # the params delimiter ? or # or #!
376 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
377 v=
378 )
379 ))
380 |(?:
381 youtu\.be| # just youtu.be/xxxx
382 vid\.plus| # or vid.plus/xxxx
383 zwearz\.com/watch| # or zwearz.com/watch/xxxx
384 )/
385 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
386 )
387 )? # all until now is optional -> you can pass the naked ID
388 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
389 (?!.*?\blist=
390 (?:
391 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
392 WL # WL are handled by the watch later IE
393 )
394 )
395 (?(1).+)? # if we found the ID, everything can follow
396 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
397 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
398 _PLAYER_INFO_RE = (
399 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
400 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
401 )
402 _formats = {
403 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
404 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
405 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
406 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
407 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
408 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
409 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
410 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
411 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
412 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
413 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
414 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
415 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
416 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
417 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
418 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
419 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
420 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
421
422
423 # 3D videos
424 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
425 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
426 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
427 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
428 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
429 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
430 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
431
432 # Apple HTTP Live Streaming
433 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
434 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
435 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
436 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
437 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
438 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
439 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
440 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
441
442 # DASH mp4 video
443 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
444 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
445 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
449 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
450 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
451 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
453 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
454 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
455
456 # Dash mp4 audio
457 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
458 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
459 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
460 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
461 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
462 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
463 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
464
465 # Dash webm
466 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
467 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
468 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
473 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
474 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
475 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
482 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
484 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
485 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
487 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488
489 # Dash webm audio
490 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
491 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
492
493 # Dash webm audio with opus inside
494 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
495 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
496 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
497
498 # RTMP (unnamed)
499 '_rtmp': {'protocol': 'rtmp'},
500
501 # av01 video only formats sometimes served with "unknown" codecs
502 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
503 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
504 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 }
507 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
508
509 _GEO_BYPASS = False
510
511 IE_NAME = 'youtube'
512 _TESTS = [
513 {
514 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
515 'info_dict': {
516 'id': 'BaW_jenozKc',
517 'ext': 'mp4',
518 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
519 'uploader': 'Philipp Hagemeister',
520 'uploader_id': 'phihag',
521 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
522 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
523 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
524 'upload_date': '20121002',
525 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
526 'categories': ['Science & Technology'],
527 'tags': ['youtube-dl'],
528 'duration': 10,
529 'view_count': int,
530 'like_count': int,
531 'dislike_count': int,
532 'start_time': 1,
533 'end_time': 9,
534 }
535 },
536 {
537 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
538 'note': 'Embed-only video (#1746)',
539 'info_dict': {
540 'id': 'yZIXLfi8CZQ',
541 'ext': 'mp4',
542 'upload_date': '20120608',
543 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
544 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
545 'uploader': 'SET India',
546 'uploader_id': 'setindia',
547 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
548 'age_limit': 18,
549 }
550 },
551 {
552 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
553 'note': 'Use the first video ID in the URL',
554 'info_dict': {
555 'id': 'BaW_jenozKc',
556 'ext': 'mp4',
557 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
558 'uploader': 'Philipp Hagemeister',
559 'uploader_id': 'phihag',
560 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
561 'upload_date': '20121002',
562 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
563 'categories': ['Science & Technology'],
564 'tags': ['youtube-dl'],
565 'duration': 10,
566 'view_count': int,
567 'like_count': int,
568 'dislike_count': int,
569 },
570 'params': {
571 'skip_download': True,
572 },
573 },
574 {
575 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
576 'note': '256k DASH audio (format 141) via DASH manifest',
577 'info_dict': {
578 'id': 'a9LDPn-MO4I',
579 'ext': 'm4a',
580 'upload_date': '20121002',
581 'uploader_id': '8KVIDEO',
582 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
583 'description': '',
584 'uploader': '8KVIDEO',
585 'title': 'UHDTV TEST 8K VIDEO.mp4'
586 },
587 'params': {
588 'youtube_include_dash_manifest': True,
589 'format': '141',
590 },
591 'skip': 'format 141 not served anymore',
592 },
593 # DASH manifest with encrypted signature
594 {
595 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
596 'info_dict': {
597 'id': 'IB3lcPjvWLA',
598 'ext': 'm4a',
599 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
600 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
601 'duration': 244,
602 'uploader': 'AfrojackVEVO',
603 'uploader_id': 'AfrojackVEVO',
604 'upload_date': '20131011',
605 },
606 'params': {
607 'youtube_include_dash_manifest': True,
608 'format': '141/bestaudio[ext=m4a]',
609 },
610 },
611 # Controversy video
612 {
613 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
614 'info_dict': {
615 'id': 'T4XJQO3qol8',
616 'ext': 'mp4',
617 'duration': 219,
618 'upload_date': '20100909',
619 'uploader': 'Amazing Atheist',
620 'uploader_id': 'TheAmazingAtheist',
621 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
622 'title': 'Burning Everyone\'s Koran',
623 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
624 }
625 },
626 # Normal age-gate video (embed allowed)
627 {
628 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
629 'info_dict': {
630 'id': 'HtVdAasjOgU',
631 'ext': 'mp4',
632 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
633 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
634 'duration': 142,
635 'uploader': 'The Witcher',
636 'uploader_id': 'WitcherGame',
637 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
638 'upload_date': '20140605',
639 'age_limit': 18,
640 },
641 },
642 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
643 # YouTube Red ad is not captured for creator
644 {
645 'url': '__2ABJjxzNo',
646 'info_dict': {
647 'id': '__2ABJjxzNo',
648 'ext': 'mp4',
649 'duration': 266,
650 'upload_date': '20100430',
651 'uploader_id': 'deadmau5',
652 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
653 'creator': 'Dada Life, deadmau5',
654 'description': 'md5:12c56784b8032162bb936a5f76d55360',
655 'uploader': 'deadmau5',
656 'title': 'Deadmau5 - Some Chords (HD)',
657 'alt_title': 'This Machine Kills Some Chords',
658 },
659 'expected_warnings': [
660 'DASH manifest missing',
661 ]
662 },
663 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
664 {
665 'url': 'lqQg6PlCWgI',
666 'info_dict': {
667 'id': 'lqQg6PlCWgI',
668 'ext': 'mp4',
669 'duration': 6085,
670 'upload_date': '20150827',
671 'uploader_id': 'olympic',
672 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
673 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
674 'uploader': 'Olympic',
675 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
676 },
677 'params': {
678 'skip_download': 'requires avconv',
679 }
680 },
681 # Non-square pixels
682 {
683 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
684 'info_dict': {
685 'id': '_b-2C3KPAM0',
686 'ext': 'mp4',
687 'stretched_ratio': 16 / 9.,
688 'duration': 85,
689 'upload_date': '20110310',
690 'uploader_id': 'AllenMeow',
691 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
692 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
693 'uploader': '孫ᄋᄅ',
694 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
695 },
696 },
697 # url_encoded_fmt_stream_map is empty string
698 {
699 'url': 'qEJwOuvDf7I',
700 'info_dict': {
701 'id': 'qEJwOuvDf7I',
702 'ext': 'webm',
703 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
704 'description': '',
705 'upload_date': '20150404',
706 'uploader_id': 'spbelect',
707 'uploader': 'Наблюдатели Петербурга',
708 },
709 'params': {
710 'skip_download': 'requires avconv',
711 },
712 'skip': 'This live event has ended.',
713 },
714 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
715 {
716 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
717 'info_dict': {
718 'id': 'FIl7x6_3R5Y',
719 'ext': 'webm',
720 'title': 'md5:7b81415841e02ecd4313668cde88737a',
721 'description': 'md5:116377fd2963b81ec4ce64b542173306',
722 'duration': 220,
723 'upload_date': '20150625',
724 'uploader_id': 'dorappi2000',
725 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
726 'uploader': 'dorappi2000',
727 'formats': 'mincount:31',
728 },
729 'skip': 'not actual anymore',
730 },
731 # DASH manifest with segment_list
732 {
733 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
734 'md5': '8ce563a1d667b599d21064e982ab9e31',
735 'info_dict': {
736 'id': 'CsmdDsKjzN8',
737 'ext': 'mp4',
738 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
739 'uploader': 'Airtek',
740 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
741 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
742 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
743 },
744 'params': {
745 'youtube_include_dash_manifest': True,
746 'format': '135', # bestvideo
747 },
748 'skip': 'This live event has ended.',
749 },
750 {
751 # Multifeed videos (multiple cameras), URL is for Main Camera
752 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
753 'info_dict': {
754 'id': 'jqWvoWXjCVs',
755 'title': 'teamPGP: Rocket League Noob Stream',
756 'description': 'md5:dc7872fb300e143831327f1bae3af010',
757 },
758 'playlist': [{
759 'info_dict': {
760 'id': 'jqWvoWXjCVs',
761 'ext': 'mp4',
762 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
763 'description': 'md5:dc7872fb300e143831327f1bae3af010',
764 'duration': 7335,
765 'upload_date': '20150721',
766 'uploader': 'Beer Games Beer',
767 'uploader_id': 'beergamesbeer',
768 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
769 'license': 'Standard YouTube License',
770 },
771 }, {
772 'info_dict': {
773 'id': '6h8e8xoXJzg',
774 'ext': 'mp4',
775 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
776 'description': 'md5:dc7872fb300e143831327f1bae3af010',
777 'duration': 7337,
778 'upload_date': '20150721',
779 'uploader': 'Beer Games Beer',
780 'uploader_id': 'beergamesbeer',
781 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
782 'license': 'Standard YouTube License',
783 },
784 }, {
785 'info_dict': {
786 'id': 'PUOgX5z9xZw',
787 'ext': 'mp4',
788 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
789 'description': 'md5:dc7872fb300e143831327f1bae3af010',
790 'duration': 7337,
791 'upload_date': '20150721',
792 'uploader': 'Beer Games Beer',
793 'uploader_id': 'beergamesbeer',
794 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
795 'license': 'Standard YouTube License',
796 },
797 }, {
798 'info_dict': {
799 'id': 'teuwxikvS5k',
800 'ext': 'mp4',
801 'title': 'teamPGP: Rocket League Noob Stream (zim)',
802 'description': 'md5:dc7872fb300e143831327f1bae3af010',
803 'duration': 7334,
804 'upload_date': '20150721',
805 'uploader': 'Beer Games Beer',
806 'uploader_id': 'beergamesbeer',
807 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
808 'license': 'Standard YouTube License',
809 },
810 }],
811 'params': {
812 'skip_download': True,
813 },
814 'skip': 'This video is not available.',
815 },
816 {
817 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
818 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
819 'info_dict': {
820 'id': 'gVfLd0zydlo',
821 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
822 },
823 'playlist_count': 2,
824 'skip': 'Not multifeed anymore',
825 },
826 {
827 'url': 'https://vid.plus/FlRa-iH7PGw',
828 'only_matching': True,
829 },
830 {
831 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
832 'only_matching': True,
833 },
834 {
835 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
836 # Also tests cut-off URL expansion in video description (see
837 # https://github.com/ytdl-org/youtube-dl/issues/1892,
838 # https://github.com/ytdl-org/youtube-dl/issues/8164)
839 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
840 'info_dict': {
841 'id': 'lsguqyKfVQg',
842 'ext': 'mp4',
843 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
844 'alt_title': 'Dark Walk - Position Music',
845 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
846 'duration': 133,
847 'upload_date': '20151119',
848 'uploader_id': 'IronSoulElf',
849 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
850 'uploader': 'IronSoulElf',
851 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
852 'track': 'Dark Walk - Position Music',
853 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
855 },
856 'params': {
857 'skip_download': True,
858 },
859 },
860 {
861 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
862 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
863 'only_matching': True,
864 },
865 {
866 # Video with yt:stretch=17:0
867 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
868 'info_dict': {
869 'id': 'Q39EVAstoRM',
870 'ext': 'mp4',
871 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
872 'description': 'md5:ee18a25c350637c8faff806845bddee9',
873 'upload_date': '20151107',
874 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
875 'uploader': 'CH GAMER DROID',
876 },
877 'params': {
878 'skip_download': True,
879 },
880 'skip': 'This video does not exist.',
881 },
882 {
883 # Video licensed under Creative Commons
884 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
885 'info_dict': {
886 'id': 'M4gD1WSo5mA',
887 'ext': 'mp4',
888 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
889 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
890 'duration': 721,
891 'upload_date': '20150127',
892 'uploader_id': 'BerkmanCenter',
893 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
894 'uploader': 'The Berkman Klein Center for Internet & Society',
895 'license': 'Creative Commons Attribution license (reuse allowed)',
896 },
897 'params': {
898 'skip_download': True,
899 },
900 },
901 {
902 # Channel-like uploader_url
903 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
904 'info_dict': {
905 'id': 'eQcmzGIKrzg',
906 'ext': 'mp4',
907 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
908 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
909 'duration': 4060,
910 'upload_date': '20151119',
911 'uploader': 'Bernie Sanders',
912 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
913 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
914 'license': 'Creative Commons Attribution license (reuse allowed)',
915 },
916 'params': {
917 'skip_download': True,
918 },
919 },
920 {
921 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
922 'only_matching': True,
923 },
924 {
925 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
926 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
927 'only_matching': True,
928 },
929 {
930 # Rental video preview
931 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
932 'info_dict': {
933 'id': 'uGpuVWrhIzE',
934 'ext': 'mp4',
935 'title': 'Piku - Trailer',
936 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
937 'upload_date': '20150811',
938 'uploader': 'FlixMatrix',
939 'uploader_id': 'FlixMatrixKaravan',
940 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
941 'license': 'Standard YouTube License',
942 },
943 'params': {
944 'skip_download': True,
945 },
946 'skip': 'This video is not available.',
947 },
948 {
949 # YouTube Red video with episode data
950 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
951 'info_dict': {
952 'id': 'iqKdEhx-dD4',
953 'ext': 'mp4',
954 'title': 'Isolation - Mind Field (Ep 1)',
955 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
956 'duration': 2085,
957 'upload_date': '20170118',
958 'uploader': 'Vsauce',
959 'uploader_id': 'Vsauce',
960 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
961 'series': 'Mind Field',
962 'season_number': 1,
963 'episode_number': 1,
964 },
965 'params': {
966 'skip_download': True,
967 },
968 'expected_warnings': [
969 'Skipping DASH manifest',
970 ],
971 },
972 {
973 # The following content has been identified by the YouTube community
974 # as inappropriate or offensive to some audiences.
975 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
976 'info_dict': {
977 'id': '6SJNVb0GnPI',
978 'ext': 'mp4',
979 'title': 'Race Differences in Intelligence',
980 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
981 'duration': 965,
982 'upload_date': '20140124',
983 'uploader': 'New Century Foundation',
984 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
985 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
986 },
987 'params': {
988 'skip_download': True,
989 },
990 },
991 {
992 # itag 212
993 'url': '1t24XAntNCY',
994 'only_matching': True,
995 },
996 {
997 # geo restricted to JP
998 'url': 'sJL6WA-aGkQ',
999 'only_matching': True,
1000 },
1001 {
1002 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1003 'only_matching': True,
1004 },
1005 {
1006 # DRM protected
1007 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1008 'only_matching': True,
1009 },
1010 {
1011 # Video with unsupported adaptive stream type formats
1012 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1013 'info_dict': {
1014 'id': 'Z4Vy8R84T1U',
1015 'ext': 'mp4',
1016 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1017 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1018 'duration': 433,
1019 'upload_date': '20130923',
1020 'uploader': 'Amelia Putri Harwita',
1021 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1022 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1023 'formats': 'maxcount:10',
1024 },
1025 'params': {
1026 'skip_download': True,
1027 'youtube_include_dash_manifest': False,
1028 },
1029 'skip': 'not actual anymore',
1030 },
1031 {
1032 # Youtube Music Auto-generated description
1033 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1034 'info_dict': {
1035 'id': 'MgNrAu2pzNs',
1036 'ext': 'mp4',
1037 'title': 'Voyeur Girl',
1038 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1039 'upload_date': '20190312',
1040 'uploader': 'Stephen - Topic',
1041 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1042 'artist': 'Stephen',
1043 'track': 'Voyeur Girl',
1044 'album': 'it\'s too much love to know my dear',
1045 'release_date': '20190313',
1046 'release_year': 2019,
1047 },
1048 'params': {
1049 'skip_download': True,
1050 },
1051 },
1052 {
1053 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1054 'only_matching': True,
1055 },
1056 {
1057 # invalid -> valid video id redirection
1058 'url': 'DJztXj2GPfl',
1059 'info_dict': {
1060 'id': 'DJztXj2GPfk',
1061 'ext': 'mp4',
1062 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1063 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1064 'upload_date': '20090125',
1065 'uploader': 'Prochorowka',
1066 'uploader_id': 'Prochorowka',
1067 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1068 'artist': 'Panjabi MC',
1069 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1070 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1071 },
1072 'params': {
1073 'skip_download': True,
1074 },
1075 },
1076 {
1077 # empty description results in an empty string
1078 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1079 'info_dict': {
1080 'id': 'x41yOUIvK2k',
1081 'ext': 'mp4',
1082 'title': 'IMG 3456',
1083 'description': '',
1084 'upload_date': '20170613',
1085 'uploader_id': 'ElevageOrVert',
1086 'uploader': 'ElevageOrVert',
1087 },
1088 'params': {
1089 'skip_download': True,
1090 },
1091 },
1092 ]
1093
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and the per-instance signature cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> decipher callable;
        # populated lazily by _decrypt_signature
        self._player_cache = {}
1097
1098 def report_video_info_webpage_download(self, video_id):
1099 """Report attempt to download video info webpage."""
1100 self.to_screen('%s: Downloading video info webpage' % video_id)
1101
1102 def report_information_extraction(self, video_id):
1103 """Report attempt to extract video information."""
1104 self.to_screen('%s: Extracting video information' % video_id)
1105
1106 def report_unavailable_format(self, video_id, format):
1107 """Report extracted video URL."""
1108 self.to_screen('%s: Format %s not available' % (video_id, format))
1109
1110 def report_rtmp_download(self):
1111 """Indicate the download will use the RTMP protocol."""
1112 self.to_screen('RTMP download detected')
1113
1114 def _signature_cache_id(self, example_sig):
1115 """ Return a string representation of a signature """
1116 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1117
1118 @classmethod
1119 def _extract_player_info(cls, player_url):
1120 for player_re in cls._PLAYER_INFO_RE:
1121 id_m = re.search(player_re, player_url)
1122 if id_m:
1123 break
1124 else:
1125 raise ExtractorError('Cannot identify player %r' % player_url)
1126 return id_m.group('ext'), id_m.group('id')
1127
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (and cache) a callable that deciphers signatures for the
        given player.

        The returned callable maps an encrypted signature string to its
        deciphered form.  Deciphering is a pure index permutation, so the
        result is cached on disk as a list of character indices keyed by
        player type/id and the "shape" of example_sig.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes part of a cache filename; it must not contain
        # path separators
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cache hit: apply the stored index permutation directly
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the index permutation by running the function on a string
        # of unique characters and recording where each one ends up
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1167
    def _print_sig_code(self, func, example_sig):
        """Print Python source code equivalent to the deciphering function
        *func*, expressed as concatenated index/slice operations on the
        input string (used with --youtube-print-sig-code)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting the parts that
                # are redundant in Python slice syntax
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, collapsing runs with step
            # +1/-1 into single slice expressions
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit the final element, or close the still-open slice
            # NOTE(review): relies on `i` from the loop above, i.e. assumes
            # len(idxs) >= 2 — holds for real signatures
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Recover the index permutation the same way
        # _extract_signature_function does
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1206
    def _parse_sig_js(self, jscode):
        """Locate the signature-decipher function in the player JavaScript
        and return a callable wrapping the interpreted function.

        The regexes are ordered from current to obsolete player code
        patterns; the first one that matches wins.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes a single argument (the signature string)
        return lambda s: initial_function([s])
1227
1228 def _parse_sig_swf(self, file_contents):
1229 swfi = SWFInterpreter(file_contents)
1230 TARGET_CLASSNAME = 'SignatureDecipher'
1231 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1232 initial_function = swfi.extract_function(searched_class, 'decipher')
1233 return lambda s: initial_function([s])
1234
1235 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1236 """Turn the encrypted s field into a working signature"""
1237
1238 if player_url is None:
1239 raise ExtractorError('Cannot decrypt signature without player_url')
1240
1241 if player_url.startswith('//'):
1242 player_url = 'https:' + player_url
1243 elif not re.match(r'https?://', player_url):
1244 player_url = compat_urlparse.urljoin(
1245 'https://www.youtube.com', player_url)
1246 try:
1247 player_id = (player_url, self._signature_cache_id(s))
1248 if player_id not in self._player_cache:
1249 func = self._extract_signature_function(
1250 video_id, player_url, s
1251 )
1252 self._player_cache[player_id] = func
1253 func = self._player_cache[player_id]
1254 if self._downloader.params.get('youtube_print_sig_code'):
1255 self._print_sig_code(func, s)
1256 return func(s)
1257 except Exception as e:
1258 tb = traceback.format_exc()
1259 raise ExtractorError(
1260 'Signature extraction failed: ' + tb, cause=e)
1261
1262 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1263 try:
1264 subs_doc = self._download_xml(
1265 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1266 video_id, note=False)
1267 except ExtractorError as err:
1268 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1269 return {}
1270
1271 sub_lang_list = {}
1272 for track in subs_doc.findall('track'):
1273 lang = track.attrib['lang_code']
1274 if lang in sub_lang_list:
1275 continue
1276 sub_formats = []
1277 for ext in self._SUBTITLE_FORMATS:
1278 params = compat_urllib_parse_urlencode({
1279 'lang': lang,
1280 'v': video_id,
1281 'fmt': ext,
1282 'name': track.attrib['name'].encode('utf-8'),
1283 })
1284 sub_formats.append({
1285 'url': 'https://www.youtube.com/api/timedtext?' + params,
1286 'ext': ext,
1287 })
1288 sub_lang_list[lang] = sub_formats
1289 if has_live_chat_replay:
1290 sub_lang_list['live_chat'] = [
1291 {
1292 'video_id': video_id,
1293 'ext': 'json',
1294 'protocol': 'youtube_live_chat_replay',
1295 },
1296 ]
1297 if not sub_lang_list:
1298 self._downloader.report_warning('video doesn\'t have subtitles')
1299 return {}
1300 return sub_lang_list
1301
    def _get_ytplayer_config(self, video_id, webpage):
        """Extract and parse the ytplayer.config JSON from the watch page.

        Returns the parsed dict, or None when no config is found or the
        JSON fails to parse.
        """
        patterns = (
            # User data may contain arbitrary character sequences that may affect
            # JSON extraction with regex, e.g. when '};' is contained the second
            # regex won't capture the whole JSON. Yet working around by trying more
            # concrete regex first keeping in mind proper quoted string handling
            # to be implemented in future that will replace this workaround (see
            # https://github.com/ytdl-org/youtube-dl/issues/7468,
            # https://github.com/ytdl-org/youtube-dl/pull/7599)
            r';ytplayer\.config\s*=\s*({.+?});ytplayer',
            r';ytplayer\.config\s*=\s*({.+?});',
            r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'  # Needed???
        )
        config = self._search_regex(
            patterns, webpage, 'ytplayer.config', default=None)
        if config:
            # uppercase_escape undoes \UXXXX-style escaping before parsing
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)
1320
1321 def _get_music_metadata_from_yt_initial(self, yt_initial):
1322 music_metadata = []
1323 key_map = {
1324 'Album': 'album',
1325 'Artist': 'artist',
1326 'Song': 'track'
1327 }
1328 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1329 if type(contents) is list:
1330 for content in contents:
1331 music_track = {}
1332 if type(content) is not dict:
1333 continue
1334 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1335 if type(videoSecondaryInfoRenderer) is not dict:
1336 continue
1337 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1338 if type(rows) is not list:
1339 continue
1340 for row in rows:
1341 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1342 if type(metadataRowRenderer) is not dict:
1343 continue
1344 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1345 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1346 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1347 if type(key) is not str or type(value) is not str:
1348 continue
1349 if key in key_map:
1350 if key_map[key] in music_track:
1351 # we've started on a new track
1352 music_metadata.append(music_track)
1353 music_track = {}
1354 music_track[key_map[key]] = value
1355 if len(music_track.keys()):
1356 music_metadata.append(music_track)
1357 return music_metadata
1358
    def _get_automatic_captions(self, video_id, webpage):
        """Extract automatic (ASR) caption tracks.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.  Returns a dict mapping language
        code -> list of subtitle format dicts, or {} on failure.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            # Oldest flavour: dedicated ttsurl endpoint
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the query
                # of the base caption URL for every (language, format) pair
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1460
1461 def _mark_watched(self, video_id, video_info, player_response):
1462 playback_url = url_or_none(try_get(
1463 player_response,
1464 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1465 video_info, lambda x: x['videostats_playback_base_url'][0]))
1466 if not playback_url:
1467 return
1468 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1469 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1470
1471 # cpn generation algorithm is reverse engineered from base.js.
1472 # In fact it works even with dummy cpn.
1473 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1474 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1475
1476 qs.update({
1477 'ver': ['2'],
1478 'cpn': [cpn],
1479 })
1480 playback_url = compat_urlparse.urlunparse(
1481 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1482
1483 self._download_webpage(
1484 playback_url, video_id, 'Marking watched',
1485 'Unable to mark watched', fatal=False)
1486
1487 @staticmethod
1488 def _extract_urls(webpage):
1489 # Embedded YouTube player
1490 entries = [
1491 unescapeHTML(mobj.group('url'))
1492 for mobj in re.finditer(r'''(?x)
1493 (?:
1494 <iframe[^>]+?src=|
1495 data-video-url=|
1496 <embed[^>]+?src=|
1497 embedSWF\(?:\s*|
1498 <object[^>]+data=|
1499 new\s+SWFObject\(
1500 )
1501 (["\'])
1502 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1503 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1504 \1''', webpage)]
1505
1506 # lazyYT YouTube embed
1507 entries.extend(list(map(
1508 unescapeHTML,
1509 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1510
1511 # Wordpress "YouTube Video Importer" plugin
1512 matches = re.findall(r'''(?x)<div[^>]+
1513 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1514 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1515 entries.extend(m[-1] for m in matches)
1516
1517 return entries
1518
1519 @staticmethod
1520 def _extract_url(webpage):
1521 urls = YoutubeIE._extract_urls(webpage)
1522 return urls[0] if urls else None
1523
1524 @classmethod
1525 def extract_id(cls, url):
1526 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1527 if mobj is None:
1528 raise ExtractorError('Invalid URL: %s' % url)
1529 video_id = mobj.group(2)
1530 return video_id
1531
1532 def _extract_chapters_from_json(self, webpage, video_id, duration):
1533 if not webpage:
1534 return
1535 data = self._extract_yt_initial_data(video_id, webpage)
1536 if not data or not isinstance(data, dict):
1537 return
1538 chapters_list = try_get(
1539 data,
1540 lambda x: x['playerOverlays']
1541 ['playerOverlayRenderer']
1542 ['decoratedPlayerBarRenderer']
1543 ['decoratedPlayerBarRenderer']
1544 ['playerBar']
1545 ['chapteredPlayerBarRenderer']
1546 ['chapters'],
1547 list)
1548 if not chapters_list:
1549 return
1550
1551 def chapter_time(chapter):
1552 return float_or_none(
1553 try_get(
1554 chapter,
1555 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1556 int),
1557 scale=1000)
1558 chapters = []
1559 for next_num, chapter in enumerate(chapters_list, start=1):
1560 start_time = chapter_time(chapter)
1561 if start_time is None:
1562 continue
1563 end_time = (chapter_time(chapters_list[next_num])
1564 if next_num < len(chapters_list) else duration)
1565 if end_time is None:
1566 continue
1567 title = try_get(
1568 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1569 compat_str)
1570 chapters.append({
1571 'start_time': start_time,
1572 'end_time': end_time,
1573 'title': title,
1574 })
1575 return chapters
1576
1577 @staticmethod
1578 def _extract_chapters_from_description(description, duration):
1579 if not description:
1580 return None
1581 chapter_lines = re.findall(
1582 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1583 description)
1584 if not chapter_lines:
1585 return None
1586 chapters = []
1587 for next_num, (chapter_line, time_point) in enumerate(
1588 chapter_lines, start=1):
1589 start_time = parse_duration(time_point)
1590 if start_time is None:
1591 continue
1592 if start_time > duration:
1593 break
1594 end_time = (duration if next_num == len(chapter_lines)
1595 else parse_duration(chapter_lines[next_num][1]))
1596 if end_time is None:
1597 continue
1598 if end_time > duration:
1599 end_time = duration
1600 if start_time > end_time:
1601 break
1602 chapter_title = re.sub(
1603 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1604 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1605 chapters.append({
1606 'start_time': start_time,
1607 'end_time': end_time,
1608 'title': chapter_title,
1609 })
1610 return chapters
1611
1612 def _extract_chapters(self, webpage, description, video_id, duration):
1613 return (self._extract_chapters_from_json(webpage, video_id, duration)
1614 or self._extract_chapters_from_description(description, duration))
1615
1616 def _real_extract(self, url):
1617 url, smuggled_data = unsmuggle_url(url, {})
1618
1619 proto = (
1620 'http' if self._downloader.params.get('prefer_insecure', False)
1621 else 'https')
1622
1623 start_time = None
1624 end_time = None
1625 parsed_url = compat_urllib_parse_urlparse(url)
1626 for component in [parsed_url.fragment, parsed_url.query]:
1627 query = compat_parse_qs(component)
1628 if start_time is None and 't' in query:
1629 start_time = parse_duration(query['t'][0])
1630 if start_time is None and 'start' in query:
1631 start_time = parse_duration(query['start'][0])
1632 if end_time is None and 'end' in query:
1633 end_time = parse_duration(query['end'][0])
1634
1635 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1636 mobj = re.search(self._NEXT_URL_RE, url)
1637 if mobj:
1638 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1639 video_id = self.extract_id(url)
1640
1641 # Get video webpage
1642 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1643 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1644
1645 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1646 video_id = qs.get('v', [None])[0] or video_id
1647
1648 # Attempt to extract SWF player URL
1649 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1650 if mobj is not None:
1651 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1652 else:
1653 player_url = None
1654
1655 dash_mpds = []
1656
1657 def add_dash_mpd(video_info):
1658 dash_mpd = video_info.get('dashmpd')
1659 if dash_mpd and dash_mpd[0] not in dash_mpds:
1660 dash_mpds.append(dash_mpd[0])
1661
1662 def add_dash_mpd_pr(pl_response):
1663 dash_mpd = url_or_none(try_get(
1664 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1665 compat_str))
1666 if dash_mpd and dash_mpd not in dash_mpds:
1667 dash_mpds.append(dash_mpd)
1668
1669 is_live = None
1670 view_count = None
1671
1672 def extract_view_count(v_info):
1673 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1674
1675 def extract_player_response(player_response, video_id):
1676 pl_response = str_or_none(player_response)
1677 if not pl_response:
1678 return
1679 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1680 if isinstance(pl_response, dict):
1681 add_dash_mpd_pr(pl_response)
1682 return pl_response
1683
1684 def extract_embedded_config(embed_webpage, video_id):
1685 embedded_config = self._search_regex(
1686 r'setConfig\(({.*})\);',
1687 embed_webpage, 'ytInitialData', default=None)
1688 if embedded_config:
1689 return embedded_config
1690
1691 player_response = {}
1692
1693 # Get video info
1694 video_info = {}
1695 embed_webpage = None
1696 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1697 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1698 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1699 age_gate = True
1700 # We simulate the access to the video from www.youtube.com/v/{video_id}
1701 # this can be viewed without login into Youtube
1702 url = proto + '://www.youtube.com/embed/%s' % video_id
1703 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1704 ext = extract_embedded_config(embed_webpage, video_id)
1705 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1706 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1707 if not playable_in_embed:
1708 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1709 playable_in_embed = ''
1710 else:
1711 playable_in_embed = playable_in_embed.group('playableinEmbed')
1712 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1713 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1714 if playable_in_embed == 'false':
1715 '''
1716 # TODO apply this patch when Support for Python 2.6(!) and above drops
1717 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1718 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1719 '''
1720 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1721 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1722 age_gate = False
1723 # Try looking directly into the video webpage
1724 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1725 if ytplayer_config:
1726 args = ytplayer_config.get("args")
1727 if args is not None:
1728 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1729 # Convert to the same format returned by compat_parse_qs
1730 video_info = dict((k, [v]) for k, v in args.items())
1731 add_dash_mpd(video_info)
1732 # Rental video is not rented but preview is available (e.g.
1733 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1734 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1735 if not video_info and args.get('ypc_vid'):
1736 return self.url_result(
1737 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1738 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1739 is_live = True
1740 if not player_response:
1741 player_response = extract_player_response(args.get('player_response'), video_id)
1742 elif not player_response:
1743 player_response = ytplayer_config
1744 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1745 add_dash_mpd_pr(player_response)
1746 else:
1747 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1748 else:
1749 data = compat_urllib_parse_urlencode({
1750 'video_id': video_id,
1751 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1752 'sts': self._search_regex(
1753 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1754 })
1755 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1756 try:
1757 video_info_webpage = self._download_webpage(
1758 video_info_url, video_id,
1759 note='Refetching age-gated info webpage',
1760 errnote='unable to download video info webpage')
1761 except ExtractorError:
1762 video_info_webpage = None
1763 if video_info_webpage:
1764 video_info = compat_parse_qs(video_info_webpage)
1765 pl_response = video_info.get('player_response', [None])[0]
1766 player_response = extract_player_response(pl_response, video_id)
1767 add_dash_mpd(video_info)
1768 view_count = extract_view_count(video_info)
1769 else:
1770 age_gate = False
1771 # Try looking directly into the video webpage
1772 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1773 if ytplayer_config:
1774 args = ytplayer_config.get('args', {})
1775 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1776 # Convert to the same format returned by compat_parse_qs
1777 video_info = dict((k, [v]) for k, v in args.items())
1778 add_dash_mpd(video_info)
1779 # Rental video is not rented but preview is available (e.g.
1780 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1781 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1782 if not video_info and args.get('ypc_vid'):
1783 return self.url_result(
1784 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1785 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1786 is_live = True
1787 if not player_response:
1788 player_response = extract_player_response(args.get('player_response'), video_id)
1789 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1790 add_dash_mpd_pr(player_response)
1791
1792 if not video_info and not player_response:
1793 player_response = extract_player_response(
1794 self._search_regex(
1795 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1796 'initial player response', default='{}'),
1797 video_id)
1798
1799 def extract_unavailable_message():
1800 messages = []
1801 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1802 msg = self._html_search_regex(
1803 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1804 video_webpage, 'unavailable %s' % kind, default=None)
1805 if msg:
1806 messages.append(msg)
1807 if messages:
1808 return '\n'.join(messages)
1809
1810 if not video_info and not player_response:
1811 unavailable_message = extract_unavailable_message()
1812 if not unavailable_message:
1813 unavailable_message = 'Unable to extract video data'
1814 raise ExtractorError(
1815 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1816
1817 if not isinstance(video_info, dict):
1818 video_info = {}
1819
1820 video_details = try_get(
1821 player_response, lambda x: x['videoDetails'], dict) or {}
1822
1823 microformat = try_get(
1824 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1825
1826 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1827 if not video_title:
1828 self._downloader.report_warning('Unable to extract video title')
1829 video_title = '_'
1830
1831 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1832 if video_description:
1833
1834 def replace_url(m):
1835 redir_url = compat_urlparse.urljoin(url, m.group(1))
1836 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1837 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1838 qs = compat_parse_qs(parsed_redir_url.query)
1839 q = qs.get('q')
1840 if q and q[0]:
1841 return q[0]
1842 return redir_url
1843
1844 description_original = video_description = re.sub(r'''(?x)
1845 <a\s+
1846 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1847 (?:title|href)="([^"]+)"\s+
1848 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1849 class="[^"]*"[^>]*>
1850 [^<]+\.{3}\s*
1851 </a>
1852 ''', replace_url, video_description)
1853 video_description = clean_html(video_description)
1854 else:
1855 video_description = video_details.get('shortDescription')
1856 if video_description is None:
1857 video_description = self._html_search_meta('description', video_webpage)
1858
1859 if not smuggled_data.get('force_singlefeed', False):
1860 if not self._downloader.params.get('noplaylist'):
1861 multifeed_metadata_list = try_get(
1862 player_response,
1863 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1864 compat_str) or try_get(
1865 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1866 if multifeed_metadata_list:
1867 entries = []
1868 feed_ids = []
1869 for feed in multifeed_metadata_list.split(','):
1870 # Unquote should take place before split on comma (,) since textual
1871 # fields may contain comma as well (see
1872 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1873 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1874
1875 def feed_entry(name):
1876 return try_get(feed_data, lambda x: x[name][0], compat_str)
1877
1878 feed_id = feed_entry('id')
1879 if not feed_id:
1880 continue
1881 feed_title = feed_entry('title')
1882 title = video_title
1883 if feed_title:
1884 title += ' (%s)' % feed_title
1885 entries.append({
1886 '_type': 'url_transparent',
1887 'ie_key': 'Youtube',
1888 'url': smuggle_url(
1889 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1890 {'force_singlefeed': True}),
1891 'title': title,
1892 })
1893 feed_ids.append(feed_id)
1894 self.to_screen(
1895 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1896 % (', '.join(feed_ids), video_id))
1897 return self.playlist_result(entries, video_id, video_title, video_description)
1898 else:
1899 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1900
1901 if view_count is None:
1902 view_count = extract_view_count(video_info)
1903 if view_count is None and video_details:
1904 view_count = int_or_none(video_details.get('viewCount'))
1905 if view_count is None and microformat:
1906 view_count = int_or_none(microformat.get('viewCount'))
1907
1908 if is_live is None:
1909 is_live = bool_or_none(video_details.get('isLive'))
1910
1911 has_live_chat_replay = False
1912 if not is_live:
1913 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1914 try:
1915 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1916 has_live_chat_replay = True
1917 except (KeyError, IndexError, TypeError):
1918 pass
1919
1920 # Check for "rental" videos
1921 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1922 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1923
1924 def _extract_filesize(media_url):
1925 return int_or_none(self._search_regex(
1926 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1927
1928 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1929 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1930
1931 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1932 self.report_rtmp_download()
1933 formats = [{
1934 'format_id': '_rtmp',
1935 'protocol': 'rtmp',
1936 'url': video_info['conn'][0],
1937 'player_url': player_url,
1938 }]
1939 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1940 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1941 if 'rtmpe%3Dyes' in encoded_url_map:
1942 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1943 formats = []
1944 formats_spec = {}
1945 fmt_list = video_info.get('fmt_list', [''])[0]
1946 if fmt_list:
1947 for fmt in fmt_list.split(','):
1948 spec = fmt.split('/')
1949 if len(spec) > 1:
1950 width_height = spec[1].split('x')
1951 if len(width_height) == 2:
1952 formats_spec[spec[0]] = {
1953 'resolution': spec[1],
1954 'width': int_or_none(width_height[0]),
1955 'height': int_or_none(width_height[1]),
1956 }
1957 for fmt in streaming_formats:
1958 itag = str_or_none(fmt.get('itag'))
1959 if not itag:
1960 continue
1961 quality = fmt.get('quality')
1962 quality_label = fmt.get('qualityLabel') or quality
1963 formats_spec[itag] = {
1964 'asr': int_or_none(fmt.get('audioSampleRate')),
1965 'filesize': int_or_none(fmt.get('contentLength')),
1966 'format_note': quality_label,
1967 'fps': int_or_none(fmt.get('fps')),
1968 'height': int_or_none(fmt.get('height')),
1969 # bitrate for itag 43 is always 2147483647
1970 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1971 'width': int_or_none(fmt.get('width')),
1972 }
1973
1974 for fmt in streaming_formats:
1975 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1976 continue
1977 url = url_or_none(fmt.get('url'))
1978
1979 if not url:
1980 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1981 if not cipher:
1982 continue
1983 url_data = compat_parse_qs(cipher)
1984 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1985 if not url:
1986 continue
1987 else:
1988 cipher = None
1989 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1990
1991 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1992 # Unsupported FORMAT_STREAM_TYPE_OTF
1993 if stream_type == 3:
1994 continue
1995
1996 format_id = fmt.get('itag') or url_data['itag'][0]
1997 if not format_id:
1998 continue
1999 format_id = compat_str(format_id)
2000
2001 if cipher:
2002 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2003 ASSETS_RE = (
2004 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2005 r'"jsUrl"\s*:\s*("[^"]+")',
2006 r'"assets":.+?"js":\s*("[^"]+")')
2007 jsplayer_url_json = self._search_regex(
2008 ASSETS_RE,
2009 embed_webpage if age_gate else video_webpage,
2010 'JS player URL (1)', default=None)
2011 if not jsplayer_url_json and not age_gate:
2012 # We need the embed website after all
2013 if embed_webpage is None:
2014 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2015 embed_webpage = self._download_webpage(
2016 embed_url, video_id, 'Downloading embed webpage')
2017 jsplayer_url_json = self._search_regex(
2018 ASSETS_RE, embed_webpage, 'JS player URL')
2019
2020 player_url = json.loads(jsplayer_url_json)
2021 if player_url is None:
2022 player_url_json = self._search_regex(
2023 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2024 video_webpage, 'age gate player URL')
2025 player_url = json.loads(player_url_json)
2026
2027 if 'sig' in url_data:
2028 url += '&signature=' + url_data['sig'][0]
2029 elif 's' in url_data:
2030 encrypted_sig = url_data['s'][0]
2031
2032 if self._downloader.params.get('verbose'):
2033 if player_url is None:
2034 player_desc = 'unknown'
2035 else:
2036 player_type, player_version = self._extract_player_info(player_url)
2037 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2038 parts_sizes = self._signature_cache_id(encrypted_sig)
2039 self.to_screen('{%s} signature length %s, %s' %
2040 (format_id, parts_sizes, player_desc))
2041
2042 signature = self._decrypt_signature(
2043 encrypted_sig, video_id, player_url, age_gate)
2044 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2045 url += '&%s=%s' % (sp, signature)
2046 if 'ratebypass' not in url:
2047 url += '&ratebypass=yes'
2048
2049 dct = {
2050 'format_id': format_id,
2051 'url': url,
2052 'player_url': player_url,
2053 }
2054 if format_id in self._formats:
2055 dct.update(self._formats[format_id])
2056 if format_id in formats_spec:
2057 dct.update(formats_spec[format_id])
2058
2059 # Some itags are not included in DASH manifest thus corresponding formats will
2060 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2061 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2062 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2063 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2064
2065 if width is None:
2066 width = int_or_none(fmt.get('width'))
2067 if height is None:
2068 height = int_or_none(fmt.get('height'))
2069
2070 filesize = int_or_none(url_data.get(
2071 'clen', [None])[0]) or _extract_filesize(url)
2072
2073 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2074 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2075
2076 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2077 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2078 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2079
2080 more_fields = {
2081 'filesize': filesize,
2082 'tbr': tbr,
2083 'width': width,
2084 'height': height,
2085 'fps': fps,
2086 'format_note': quality_label or quality,
2087 }
2088 for key, value in more_fields.items():
2089 if value:
2090 dct[key] = value
2091 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2092 if type_:
2093 type_split = type_.split(';')
2094 kind_ext = type_split[0].split('/')
2095 if len(kind_ext) == 2:
2096 kind, _ = kind_ext
2097 dct['ext'] = mimetype2ext(type_split[0])
2098 if kind in ('audio', 'video'):
2099 codecs = None
2100 for mobj in re.finditer(
2101 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2102 if mobj.group('key') == 'codecs':
2103 codecs = mobj.group('val')
2104 break
2105 if codecs:
2106 dct.update(parse_codecs(codecs))
2107 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2108 dct['downloader_options'] = {
2109 # Youtube throttles chunks >~10M
2110 'http_chunk_size': 10485760,
2111 }
2112 formats.append(dct)
2113 else:
2114 manifest_url = (
2115 url_or_none(try_get(
2116 player_response,
2117 lambda x: x['streamingData']['hlsManifestUrl'],
2118 compat_str))
2119 or url_or_none(try_get(
2120 video_info, lambda x: x['hlsvp'][0], compat_str)))
2121 if manifest_url:
2122 formats = []
2123 m3u8_formats = self._extract_m3u8_formats(
2124 manifest_url, video_id, 'mp4', fatal=False)
2125 for a_format in m3u8_formats:
2126 itag = self._search_regex(
2127 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2128 if itag:
2129 a_format['format_id'] = itag
2130 if itag in self._formats:
2131 dct = self._formats[itag].copy()
2132 dct.update(a_format)
2133 a_format = dct
2134 a_format['player_url'] = player_url
2135 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2136 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2137 if self._downloader.params.get('youtube_include_hls_manifest', True):
2138 formats.append(a_format)
2139 else:
2140 error_message = extract_unavailable_message()
2141 if not error_message:
2142 error_message = clean_html(try_get(
2143 player_response, lambda x: x['playabilityStatus']['reason'],
2144 compat_str))
2145 if not error_message:
2146 error_message = clean_html(
2147 try_get(video_info, lambda x: x['reason'][0], compat_str))
2148 if error_message:
2149 raise ExtractorError(error_message, expected=True)
2150 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2151
2152 # uploader
2153 video_uploader = try_get(
2154 video_info, lambda x: x['author'][0],
2155 compat_str) or str_or_none(video_details.get('author'))
2156 if video_uploader:
2157 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2158 else:
2159 self._downloader.report_warning('unable to extract uploader name')
2160
2161 # uploader_id
2162 video_uploader_id = None
2163 video_uploader_url = None
2164 mobj = re.search(
2165 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2166 video_webpage)
2167 if mobj is not None:
2168 video_uploader_id = mobj.group('uploader_id')
2169 video_uploader_url = mobj.group('uploader_url')
2170 else:
2171 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2172 if owner_profile_url:
2173 video_uploader_id = self._search_regex(
2174 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2175 default=None)
2176 video_uploader_url = owner_profile_url
2177
2178 channel_id = (
2179 str_or_none(video_details.get('channelId'))
2180 or self._html_search_meta(
2181 'channelId', video_webpage, 'channel id', default=None)
2182 or self._search_regex(
2183 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2184 video_webpage, 'channel id', default=None, group='id'))
2185 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2186
2187 thumbnails = []
2188 thumbnails_list = try_get(
2189 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2190 for t in thumbnails_list:
2191 if not isinstance(t, dict):
2192 continue
2193 thumbnail_url = url_or_none(t.get('url'))
2194 if not thumbnail_url:
2195 continue
2196 thumbnails.append({
2197 'url': thumbnail_url,
2198 'width': int_or_none(t.get('width')),
2199 'height': int_or_none(t.get('height')),
2200 })
2201
2202 if not thumbnails:
2203 video_thumbnail = None
2204 # We try first to get a high quality image:
2205 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2206 video_webpage, re.DOTALL)
2207 if m_thumb is not None:
2208 video_thumbnail = m_thumb.group(1)
2209 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2210 if thumbnail_url:
2211 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2212 if video_thumbnail:
2213 thumbnails.append({'url': video_thumbnail})
2214
2215 # upload date
2216 upload_date = self._html_search_meta(
2217 'datePublished', video_webpage, 'upload date', default=None)
2218 if not upload_date:
2219 upload_date = self._search_regex(
2220 [r'(?s)id="eow-date.*?>(.*?)</span>',
2221 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2222 video_webpage, 'upload date', default=None)
2223 if not upload_date:
2224 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2225 upload_date = unified_strdate(upload_date)
2226
2227 video_license = self._html_search_regex(
2228 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2229 video_webpage, 'license', default=None)
2230
2231 m_music = re.search(
2232 r'''(?x)
2233 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2234 <ul[^>]*>\s*
2235 <li>(?P<title>.+?)
2236 by (?P<creator>.+?)
2237 (?:
2238 \(.+?\)|
2239 <a[^>]*
2240 (?:
2241 \bhref=["\']/red[^>]*>| # drop possible
2242 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2243 )
2244 .*?
2245 )?</li
2246 ''',
2247 video_webpage)
2248 if m_music:
2249 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2250 video_creator = clean_html(m_music.group('creator'))
2251 else:
2252 video_alt_title = video_creator = None
2253
2254 def extract_meta(field):
2255 return self._html_search_regex(
2256 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2257 video_webpage, field, default=None)
2258
2259 track = extract_meta('Song')
2260 artist = extract_meta('Artist')
2261 album = extract_meta('Album')
2262
2263 # Youtube Music Auto-generated description
2264 release_date = release_year = None
2265 if video_description:
2266 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2267 if mobj:
2268 if not track:
2269 track = mobj.group('track').strip()
2270 if not artist:
2271 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2272 if not album:
2273 album = mobj.group('album'.strip())
2274 release_year = mobj.group('release_year')
2275 release_date = mobj.group('release_date')
2276 if release_date:
2277 release_date = release_date.replace('-', '')
2278 if not release_year:
2279 release_year = int(release_date[:4])
2280 if release_year:
2281 release_year = int(release_year)
2282
2283 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2284 if yt_initial:
2285 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2286 if len(music_metadata):
2287 album = music_metadata[0].get('album')
2288 artist = music_metadata[0].get('artist')
2289 track = music_metadata[0].get('track')
2290
2291 m_episode = re.search(
2292 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2293 video_webpage)
2294 if m_episode:
2295 series = unescapeHTML(m_episode.group('series'))
2296 season_number = int(m_episode.group('season'))
2297 episode_number = int(m_episode.group('episode'))
2298 else:
2299 series = season_number = episode_number = None
2300
2301 m_cat_container = self._search_regex(
2302 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2303 video_webpage, 'categories', default=None)
2304 category = None
2305 if m_cat_container:
2306 category = self._html_search_regex(
2307 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2308 default=None)
2309 if not category:
2310 category = try_get(
2311 microformat, lambda x: x['category'], compat_str)
2312 video_categories = None if category is None else [category]
2313
2314 video_tags = [
2315 unescapeHTML(m.group('content'))
2316 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2317 if not video_tags:
2318 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2319
2320 def _extract_count(count_name):
2321 return str_to_int(self._search_regex(
2322 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2323 % re.escape(count_name),
2324 video_webpage, count_name, default=None))
2325
2326 like_count = _extract_count('like')
2327 dislike_count = _extract_count('dislike')
2328
2329 if view_count is None:
2330 view_count = str_to_int(self._search_regex(
2331 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2332 'view count', default=None))
2333
2334 average_rating = (
2335 float_or_none(video_details.get('averageRating'))
2336 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2337
2338 # subtitles
2339 video_subtitles = self.extract_subtitles(
2340 video_id, video_webpage, has_live_chat_replay)
2341 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2342
2343 video_duration = try_get(
2344 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2345 if not video_duration:
2346 video_duration = int_or_none(video_details.get('lengthSeconds'))
2347 if not video_duration:
2348 video_duration = parse_duration(self._html_search_meta(
2349 'duration', video_webpage, 'video duration'))
2350
2351 # Get Subscriber Count of channel
2352 subscriber_count = parse_count(self._search_regex(
2353 r'"text":"([\d\.]+\w?) subscribers"',
2354 video_webpage,
2355 'subscriber count',
2356 default=None
2357 ))
2358
2359 # annotations
2360 video_annotations = None
2361 if self._downloader.params.get('writeannotations', False):
2362 xsrf_token = self._search_regex(
2363 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2364 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2365 invideo_url = try_get(
2366 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2367 if xsrf_token and invideo_url:
2368 xsrf_field_name = self._search_regex(
2369 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2370 video_webpage, 'xsrf field name',
2371 group='xsrf_field_name', default='session_token')
2372 video_annotations = self._download_webpage(
2373 self._proto_relative_url(invideo_url),
2374 video_id, note='Downloading annotations',
2375 errnote='Unable to download video annotations', fatal=False,
2376 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2377
2378 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2379
2380 # Look for the DASH manifest
2381 if self._downloader.params.get('youtube_include_dash_manifest', True):
2382 dash_mpd_fatal = True
2383 for mpd_url in dash_mpds:
2384 dash_formats = {}
2385 try:
2386 def decrypt_sig(mobj):
2387 s = mobj.group(1)
2388 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2389 return '/signature/%s' % dec_s
2390
2391 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2392
2393 for df in self._extract_mpd_formats(
2394 mpd_url, video_id, fatal=dash_mpd_fatal,
2395 formats_dict=self._formats):
2396 if not df.get('filesize'):
2397 df['filesize'] = _extract_filesize(df['url'])
2398 # Do not overwrite DASH format found in some previous DASH manifest
2399 if df['format_id'] not in dash_formats:
2400 dash_formats[df['format_id']] = df
2401 # Additional DASH manifests may end up in HTTP Error 403 therefore
2402 # allow them to fail without bug report message if we already have
2403 # some DASH manifest succeeded. This is temporary workaround to reduce
2404 # burst of bug reports until we figure out the reason and whether it
2405 # can be fixed at all.
2406 dash_mpd_fatal = False
2407 except (ExtractorError, KeyError) as e:
2408 self.report_warning(
2409 'Skipping DASH manifest: %r' % e, video_id)
2410 if dash_formats:
2411 # Remove the formats we found through non-DASH, they
2412 # contain less info and it can be wrong, because we use
2413 # fixed values (for example the resolution). See
2414 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2415 # example.
2416 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2417 formats.extend(dash_formats.values())
2418
2419 # Check for malformed aspect ratio
2420 stretched_m = re.search(
2421 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2422 video_webpage)
2423 if stretched_m:
2424 w = float(stretched_m.group('w'))
2425 h = float(stretched_m.group('h'))
2426 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2427 # We will only process correct ratios.
2428 if w > 0 and h > 0:
2429 ratio = w / h
2430 for f in formats:
2431 if f.get('vcodec') != 'none':
2432 f['stretched_ratio'] = ratio
2433
2434 if not formats:
2435 if 'reason' in video_info:
2436 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2437 regions_allowed = self._html_search_meta(
2438 'regionsAllowed', video_webpage, default=None)
2439 countries = regions_allowed.split(',') if regions_allowed else None
2440 self.raise_geo_restricted(
2441 msg=video_info['reason'][0], countries=countries)
2442 reason = video_info['reason'][0]
2443 if 'Invalid parameters' in reason:
2444 unavailable_message = extract_unavailable_message()
2445 if unavailable_message:
2446 reason = unavailable_message
2447 raise ExtractorError(
2448 'YouTube said: %s' % reason,
2449 expected=True, video_id=video_id)
2450 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2451 raise ExtractorError('This video is DRM protected.', expected=True)
2452
2453 self._sort_formats(formats)
2454
2455 self.mark_watched(video_id, video_info, player_response)
2456
2457 return {
2458 'id': video_id,
2459 'uploader': video_uploader,
2460 'uploader_id': video_uploader_id,
2461 'uploader_url': video_uploader_url,
2462 'channel_id': channel_id,
2463 'channel_url': channel_url,
2464 'upload_date': upload_date,
2465 'license': video_license,
2466 'creator': video_creator or artist,
2467 'title': video_title,
2468 'alt_title': video_alt_title or track,
2469 'thumbnails': thumbnails,
2470 'description': video_description,
2471 'categories': video_categories,
2472 'tags': video_tags,
2473 'subtitles': video_subtitles,
2474 'automatic_captions': automatic_captions,
2475 'duration': video_duration,
2476 'age_limit': 18 if age_gate else 0,
2477 'annotations': video_annotations,
2478 'chapters': chapters,
2479 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2480 'view_count': view_count,
2481 'like_count': like_count,
2482 'dislike_count': dislike_count,
2483 'average_rating': average_rating,
2484 'formats': formats,
2485 'is_live': is_live,
2486 'start_time': start_time,
2487 'end_time': end_time,
2488 'series': series,
2489 'season_number': season_number,
2490 'episode_number': episode_number,
2491 'track': track,
2492 'artist': artist,
2493 'album': album,
2494 'release_date': release_date,
2495 'release_year': release_year,
2496 'subscriber_count': subscriber_count,
2497 }
2498
2499
class YoutubeTabIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube channel/user/playlist "tab" pages (Home, Videos,
    Playlists, Community, Channels) and for watch URLs carrying a list= param."""
    IE_DESC = 'YouTube.com tab'
    # (?x)^ will cause warning in LiveIE. So I cant split this into multiple lines using '''
    # Matches channel/c/user/playlist/watch?list= URLs while excluding the
    # reserved top-level paths enumerated in _RESERVED_NAMES (watch, results,
    # feed/..., etc.), which belong to other extractors.
    _VALID_URL = (
        r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/'
        r'(?:(?!(%s)([/#?]|$))|'
        r'(?:channel|c|user)/|'
        r'(?:playlist|watch)\?.*?\blist=)'
        r'(?P<id>[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES
    IE_NAME = 'youtube:tab'
2510
2511 _TESTS = [{
2512 # playlists, multipage
2513 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2514 'playlist_mincount': 94,
2515 'info_dict': {
2516 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2517 'title': 'Игорь Клейнер - Playlists',
2518 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2519 },
2520 }, {
2521 # playlists, multipage, different order
2522 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2523 'playlist_mincount': 94,
2524 'info_dict': {
2525 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2526 'title': 'Игорь Клейнер - Playlists',
2527 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2528 },
2529 }, {
2530 # playlists, singlepage
2531 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2532 'playlist_mincount': 4,
2533 'info_dict': {
2534 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2535 'title': 'ThirstForScience - Playlists',
2536 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2537 }
2538 }, {
2539 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2540 'only_matching': True,
2541 }, {
2542 # basic, single video playlist
2543 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2544 'info_dict': {
2545 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2546 'uploader': 'Sergey M.',
2547 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2548 'title': 'youtube-dl public playlist',
2549 },
2550 'playlist_count': 1,
2551 }, {
2552 # empty playlist
2553 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2554 'info_dict': {
2555 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2556 'uploader': 'Sergey M.',
2557 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2558 'title': 'youtube-dl empty playlist',
2559 },
2560 'playlist_count': 0,
2561 }, {
2562 # Home tab
2563 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2564 'info_dict': {
2565 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2566 'title': 'lex will - Home',
2567 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2568 },
2569 'playlist_mincount': 2,
2570 }, {
2571 # Videos tab
2572 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2573 'info_dict': {
2574 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2575 'title': 'lex will - Videos',
2576 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2577 },
2578 'playlist_mincount': 975,
2579 }, {
2580 # Videos tab, sorted by popular
2581 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2582 'info_dict': {
2583 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2584 'title': 'lex will - Videos',
2585 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2586 },
2587 'playlist_mincount': 199,
2588 }, {
2589 # Playlists tab
2590 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2591 'info_dict': {
2592 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2593 'title': 'lex will - Playlists',
2594 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2595 },
2596 'playlist_mincount': 17,
2597 }, {
2598 # Community tab
2599 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2600 'info_dict': {
2601 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2602 'title': 'lex will - Community',
2603 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2604 },
2605 'playlist_mincount': 18,
2606 }, {
2607 # Channels tab
2608 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2609 'info_dict': {
2610 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2611 'title': 'lex will - Channels',
2612 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2613 },
2614 'playlist_mincount': 138,
2615 }, {
2616 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2617 'only_matching': True,
2618 }, {
2619 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2620 'only_matching': True,
2621 }, {
2622 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ',
2623 'only_matching': True,
2624 }, {
2625 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2626 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2627 'info_dict': {
2628 'title': '29C3: Not my department',
2629 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2630 'uploader': 'Christiaan008',
2631 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2632 },
2633 'playlist_count': 96,
2634 }, {
2635 'note': 'Large playlist',
2636 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2637 'info_dict': {
2638 'title': 'Uploads from Cauchemar',
2639 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2640 'uploader': 'Cauchemar',
2641 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2642 },
2643 'playlist_mincount': 1123,
2644 }, {
2645 # even larger playlist, 8832 videos
2646 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2647 'only_matching': True,
2648 }, {
2649 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2650 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2651 'info_dict': {
2652 'title': 'Uploads from Interstellar Movie',
2653 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2654 'uploader': 'Interstellar Movie',
2655 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2656 },
2657 'playlist_mincount': 21,
2658 }, {
2659 # https://github.com/ytdl-org/youtube-dl/issues/21844
2660 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2661 'info_dict': {
2662 'title': 'Data Analysis with Dr Mike Pound',
2663 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2664 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2665 'uploader': 'Computerphile',
2666 },
2667 'playlist_mincount': 11,
2668 }, {
2669 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2670 'only_matching': True,
2671 }, {
2672 # Playlist URL that does not actually serve a playlist
2673 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2674 'info_dict': {
2675 'id': 'FqZTN594JQw',
2676 'ext': 'webm',
2677 'title': "Smiley's People 01 detective, Adventure Series, Action",
2678 'uploader': 'STREEM',
2679 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2680 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2681 'upload_date': '20150526',
2682 'license': 'Standard YouTube License',
2683 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2684 'categories': ['People & Blogs'],
2685 'tags': list,
2686 'view_count': int,
2687 'like_count': int,
2688 'dislike_count': int,
2689 },
2690 'params': {
2691 'skip_download': True,
2692 },
2693 'skip': 'This video is not available.',
2694 'add_ie': [YoutubeIE.ie_key()],
2695 }, {
2696 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2697 'only_matching': True,
2698 }, {
2699 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2700 'only_matching': True,
2701 }]
2702
2703 @classmethod
2704 def suitable(cls, url):
2705 IGNORE = (YoutubeLiveIE,)
2706 return (
2707 False if any(ie.suitable(url) for ie in IGNORE)
2708 else super(YoutubeTabIE, cls).suitable(url))
2709
2710 def _extract_channel_id(self, webpage):
2711 channel_id = self._html_search_meta(
2712 'channelId', webpage, 'channel id', default=None)
2713 if channel_id:
2714 return channel_id
2715 channel_url = self._html_search_meta(
2716 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2717 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2718 'twitter:app:url:googleplay'), webpage, 'channel url')
2719 return self._search_regex(
2720 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2721 channel_url, 'channel id')
2722
2723 @staticmethod
2724 def _extract_grid_item_renderer(item):
2725 for item_kind in ('Playlist', 'Video', 'Channel'):
2726 renderer = item.get('grid%sRenderer' % item_kind)
2727 if renderer:
2728 return renderer
2729
2730 def _extract_video(self, renderer):
2731 video_id = renderer.get('videoId')
2732 title = try_get(
2733 renderer,
2734 (lambda x: x['title']['runs'][0]['text'],
2735 lambda x: x['title']['simpleText']), compat_str)
2736 description = try_get(
2737 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2738 compat_str)
2739 duration = parse_duration(try_get(
2740 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2741 view_count_text = try_get(
2742 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2743 view_count = str_to_int(self._search_regex(
2744 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2745 'view count', default=None))
2746 uploader = try_get(
2747 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2748 return {
2749 '_type': 'url_transparent',
2750 'ie_key': YoutubeIE.ie_key(),
2751 'id': video_id,
2752 'url': video_id,
2753 'title': title,
2754 'description': description,
2755 'duration': duration,
2756 'view_count': view_count,
2757 'uploader': uploader,
2758 }
2759
2760 def _grid_entries(self, grid_renderer):
2761 for item in grid_renderer['items']:
2762 if not isinstance(item, dict):
2763 continue
2764 renderer = self._extract_grid_item_renderer(item)
2765 if not isinstance(renderer, dict):
2766 continue
2767 title = try_get(
2768 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2769 # playlist
2770 playlist_id = renderer.get('playlistId')
2771 if playlist_id:
2772 yield self.url_result(
2773 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2774 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2775 video_title=title)
2776 # video
2777 video_id = renderer.get('videoId')
2778 if video_id:
2779 yield self._extract_video(renderer)
2780 # channel
2781 channel_id = renderer.get('channelId')
2782 if channel_id:
2783 title = try_get(
2784 renderer, lambda x: x['title']['simpleText'], compat_str)
2785 yield self.url_result(
2786 'https://www.youtube.com/channel/%s' % channel_id,
2787 ie=YoutubeTabIE.ie_key(), video_title=title)
2788
2789 def _shelf_entries_trimmed(self, shelf_renderer):
2790 renderer = try_get(
2791 shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
2792 if not renderer:
2793 return
2794 # TODO: add support for nested playlists so each shelf is processed
2795 # as separate playlist
2796 # TODO: this includes only first N items
2797 for entry in self._grid_entries(renderer):
2798 yield entry
2799
2800 def _shelf_entries(self, shelf_renderer):
2801 ep = try_get(
2802 shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
2803 compat_str)
2804 shelf_url = urljoin('https://www.youtube.com', ep)
2805 if not shelf_url:
2806 return
2807 title = try_get(
2808 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2809 yield self.url_result(shelf_url, video_title=title)
2810
2811 def _playlist_entries(self, video_list_renderer):
2812 for content in video_list_renderer['contents']:
2813 if not isinstance(content, dict):
2814 continue
2815 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2816 if not isinstance(renderer, dict):
2817 continue
2818 video_id = renderer.get('videoId')
2819 if not video_id:
2820 continue
2821 yield self._extract_video(renderer)
2822
2823 def _itemSection_entries(self, item_sect_renderer):
2824 for content in item_sect_renderer['contents']:
2825 if not isinstance(content, dict):
2826 continue
2827 renderer = content.get('videoRenderer', {})
2828 if not isinstance(renderer, dict):
2829 continue
2830 video_id = renderer.get('videoId')
2831 if not video_id:
2832 continue
2833 yield self._extract_video(renderer)
2834
2835 def _rich_entries(self, rich_grid_renderer):
2836 renderer = try_get(
2837 rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict)
2838 video_id = renderer.get('videoId')
2839 if not video_id:
2840 return
2841 yield self._extract_video(renderer)
2842
2843 def _video_entry(self, video_renderer):
2844 video_id = video_renderer.get('videoId')
2845 if video_id:
2846 return self._extract_video(video_renderer)
2847
2848 def _post_thread_entries(self, post_thread_renderer):
2849 post_renderer = try_get(
2850 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
2851 if not post_renderer:
2852 return
2853 # video attachment
2854 video_renderer = try_get(
2855 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
2856 video_id = None
2857 if video_renderer:
2858 entry = self._video_entry(video_renderer)
2859 if entry:
2860 yield entry
2861 # inline video links
2862 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
2863 for run in runs:
2864 if not isinstance(run, dict):
2865 continue
2866 ep_url = try_get(
2867 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
2868 if not ep_url:
2869 continue
2870 if not YoutubeIE.suitable(ep_url):
2871 continue
2872 ep_video_id = YoutubeIE._match_id(ep_url)
2873 if video_id == ep_video_id:
2874 continue
2875 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
2876
2877 def _post_thread_continuation_entries(self, post_thread_continuation):
2878 contents = post_thread_continuation.get('contents')
2879 if not isinstance(contents, list):
2880 return
2881 for content in contents:
2882 renderer = content.get('backstagePostThreadRenderer')
2883 if not isinstance(renderer, dict):
2884 continue
2885 for entry in self._post_thread_entries(renderer):
2886 yield entry
2887
2888 @staticmethod
2889 def _extract_next_continuation_data(renderer):
2890 next_continuation = try_get(
2891 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
2892 if not next_continuation:
2893 return
2894 continuation = next_continuation.get('continuation')
2895 if not continuation:
2896 return
2897 ctp = next_continuation.get('clickTrackingParams')
2898 return {
2899 'ctoken': continuation,
2900 'continuation': continuation,
2901 'itct': ctp,
2902 }
2903
2904 @classmethod
2905 def _extract_continuation(cls, renderer):
2906 next_continuation = cls._extract_next_continuation_data(renderer)
2907 if next_continuation:
2908 return next_continuation
2909 contents = renderer.get('contents')
2910 if not isinstance(contents, list):
2911 return
2912 for content in contents:
2913 if not isinstance(content, dict):
2914 continue
2915 continuation_ep = try_get(
2916 content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
2917 dict)
2918 if not continuation_ep:
2919 continue
2920 continuation = try_get(
2921 continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
2922 if not continuation:
2923 continue
2924 ctp = continuation_ep.get('clickTrackingParams')
2925 if not ctp:
2926 continue
2927 return {
2928 'ctoken': continuation,
2929 'continuation': continuation,
2930 'itct': ctp,
2931 }
2932
    def _entries(self, tab, identity_token):
        """Yield all entries of the selected tab, following continuations.

        *tab* is the selected tab's 'content' dict; *identity_token*, when
        set, is sent as x-youtube-identity-token so logged-in feeds work.
        Pagination uses the browse_ajax endpoint; the next continuation is
        communicated out of extract_entries via continuation_list[0].
        """

        def extract_entries(parent_renderer):
            # Walks the contents of a sectionListRenderer/richGridRenderer,
            # yields entries and records the next continuation (if any) in
            # continuation_list[0] for the pagination loop below.
            slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for slr_content in slr_contents:
                if not isinstance(slr_content, dict):
                    continue
                is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Rich-grid items (e.g. channel Videos tab) appear at this level.
                    renderer = slr_content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    renderer = isr_content.get('playlistVideoListRenderer')
                    if renderer:
                        for entry in self._playlist_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('gridRenderer')
                    if renderer:
                        for entry in self._grid_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('shelfRenderer')
                    if renderer:
                        for entry in self._shelf_entries(renderer):
                            yield entry
                        # NOTE: shelves take their continuation from the parent,
                        # not from the shelf renderer itself.
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                        continue
                    renderer = isr_content.get('backstagePostThreadRenderer')
                    if renderer:
                        for entry in self._post_thread_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        continue
                    renderer = isr_content.get('videoRenderer')
                    if renderer:
                        entry = self._video_entry(renderer)
                        if entry:
                            yield entry
                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)

        continuation_list = [None]  # Python 2 does not support nonlocal
        parent_renderer = (
            try_get(tab, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
        if parent_renderer:
            for entry in extract_entries(parent_renderer):
                yield entry

        continuation = continuation_list[0]

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        # Pagination: keep requesting continuations until none is returned.
        for page_num in itertools.count(1):
            if not continuation:
                break
            # _MAX_PAGES is optionally set by subclasses to cap pagination.
            if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES:
                break
            browse = self._download_json(
                'https://www.youtube.com/browse_ajax', None,
                'Downloading page %d' % page_num,
                headers=headers, query=continuation, fatal=False)
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Legacy continuation format: response['continuationContents'].
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('sectionListContinuation')
                if continuation_renderer:
                    continuation_list = [None]
                    for entry in extract_entries(continuation_renderer):
                        yield entry
                    continuation = continuation_list[0]
                    continue

            # Newer continuation format: onResponseReceivedActions.
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    continue
                renderer = continuation_item.get('playlistVideoRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue
                renderer = continuation_item.get('itemSectionRenderer')
                if renderer:
                    for entry in self._itemSection_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation({'contents': continuation_items})
                    continue
            break
3067
3068 @staticmethod
3069 def _extract_selected_tab(tabs):
3070 for tab in tabs:
3071 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3072 return tab['tabRenderer']
3073 else:
3074 raise ExtractorError('Unable to find selected tab')
3075
3076 @staticmethod
3077 def _extract_uploader(data):
3078 uploader = {}
3079 sidebar_renderer = try_get(
3080 data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
3081 if sidebar_renderer:
3082 for item in sidebar_renderer:
3083 if not isinstance(item, dict):
3084 continue
3085 renderer = item.get('playlistSidebarSecondaryInfoRenderer')
3086 if not isinstance(renderer, dict):
3087 continue
3088 owner = try_get(
3089 renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
3090 if owner:
3091 uploader['uploader'] = owner.get('text')
3092 uploader['uploader_id'] = try_get(
3093 owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
3094 uploader['uploader_url'] = urljoin(
3095 'https://www.youtube.com/',
3096 try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
3097 return uploader
3098
    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Build a playlist result for the currently selected channel/playlist tab.

        Returns None when neither channel nor playlist metadata yields a
        playlist id. Title is "<channel> - <tab>" for channels, or the
        playlist's own title for playlists.
        """
        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        playlist_id = None
        if renderer:
            channel_title = renderer.get('title') or item_id
            tab_title = selected_tab.get('title')
            title = channel_title or item_id
            if tab_title:
                title += ' - %s' % tab_title
            description = renderer.get('description')
            playlist_id = renderer.get('externalId')
        # playlistMetadataRenderer takes precedence over channel metadata.
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
        if renderer:
            title = renderer.get('title')
            description = None
            playlist_id = item_id
        # NOTE: title/description are only bound inside the branches above,
        # but playlist_id is None exactly when neither branch ran, so this
        # early return keeps them from being read unbound.
        if playlist_id is None:
            return None
        playlist = self.playlist_result(
            self._entries(selected_tab['content'], identity_token),
            playlist_id=playlist_id, playlist_title=title,
            playlist_description=description)
        playlist.update(self._extract_uploader(data))
        return playlist
3126
3127 def _extract_from_playlist(self, item_id, data, playlist):
3128 title = playlist.get('title') or try_get(
3129 data, lambda x: x['titleText']['simpleText'], compat_str)
3130 playlist_id = playlist.get('playlistId') or item_id
3131 return self.playlist_result(
3132 self._playlist_entries(playlist), playlist_id=playlist_id,
3133 playlist_title=title)
3134
    def _real_extract(self, url):
        """Dispatch a tab/playlist/watch URL to the appropriate extraction path."""
        item_id = self._match_id(url)
        # Normalize the host (music./kids./invidious mirrors) to www.youtube.com.
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        if video_id and playlist_id:
            # --no-playlist short-circuits to single-video extraction.
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3166
3167
class YoutubePlaylistIE(InfoExtractor):
    """Thin extractor that normalizes playlist URLs (including bare playlist
    ids) and delegates actual extraction to YoutubeTabIE."""
    IE_DESC = 'YouTube.com playlists'
    # Accepts full playlist URLs as well as bare playlist ids (see _TESTS).
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
3184 _TESTS = [{
3185 'note': 'issue #673',
3186 'url': 'PLBB231211A4F62143',
3187 'info_dict': {
3188 'title': '[OLD]Team Fortress 2 (Class-based LP)',
3189 'id': 'PLBB231211A4F62143',
3190 'uploader': 'Wickydoo',
3191 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
3192 },
3193 'playlist_mincount': 29,
3194 }, {
3195 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
3196 'info_dict': {
3197 'title': 'YDL_safe_search',
3198 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
3199 },
3200 'playlist_count': 2,
3201 'skip': 'This playlist is private',
3202 }, {
3203 'note': 'embedded',
3204 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
3205 'playlist_count': 4,
3206 'info_dict': {
3207 'title': 'JODA15',
3208 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
3209 'uploader': 'milan',
3210 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
3211 }
3212 }, {
3213 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
3214 'playlist_mincount': 982,
3215 'info_dict': {
3216 'title': '2018 Chinese New Singles (11/6 updated)',
3217 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
3218 'uploader': 'LBK',
3219 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
3220 }
3221 }, {
3222 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
3223 'info_dict': {
3224 'id': 'yeWKywCrFtk',
3225 'ext': 'mp4',
3226 'title': 'Small Scale Baler and Braiding Rugs',
3227 'uploader': 'Backus-Page House Museum',
3228 'uploader_id': 'backuspagemuseum',
3229 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
3230 'upload_date': '20161008',
3231 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
3232 'categories': ['Nonprofits & Activism'],
3233 'tags': list,
3234 'like_count': int,
3235 'dislike_count': int,
3236 },
3237 'params': {
3238 'noplaylist': True,
3239 'skip_download': True,
3240 },
3241 }, {
3242 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
3243 'only_matching': True,
3244 }, {
3245 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
3246 'only_matching': True,
3247 }, {
3248 # music album playlist
3249 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
3250 'only_matching': True,
3251 }]
3252
3253 @classmethod
3254 def suitable(cls, url):
3255 return False if YoutubeTabIE.suitable(url) else super(
3256 YoutubePlaylistIE, cls).suitable(url)
3257
3258 def _real_extract(self, url):
3259 playlist_id = self._match_id(url)
3260 qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
3261 if not qs:
3262 qs = {'list': playlist_id}
3263 return self.url_result(
3264 update_url_query('https://www.youtube.com/playlist', qs),
3265 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3266
3267
class YoutubeYtUserIE(InfoExtractor):
    """Handle the internal ytuser:<name> scheme by delegating to the tab extractor."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        username = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % username
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=username)
3280
3281
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for /live URLs: resolves a channel's live stream to its
    video id when one is broadcasting, otherwise falls back to the channel."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>%s)/live' % YoutubeTabIE._VALID_URL
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve /live to the current live video, or fall back to the base URL."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        # Best-effort download: on failure we still fall back to base_url.
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            # Only accept a syntactically valid 11-char video id on a video page.
            if page_type.startswith('video') and video_id and re.match(
                    r'^[0-9A-Za-z_-]{11}$', video_id):
                return self.url_result(video_id, YoutubeIE.ie_key())
        # No live stream found: let the tab extractor handle the channel page.
        return self.url_result(base_url)
3332
3333
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor driven by the ytsearchN: pseudo-URL scheme."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional extra filter value sent as 'params' in the API payload;
    # overridden by subclasses (see YoutubeSearchDateIE).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent result dicts for *query*,
        paginating through the InnerTube search API."""
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page nests results under sectionListRenderer;
            # continuation pages return them via onResponseReceivedCommands.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Non-video items (shelves, ads, channels) have no
                # videoRenderer and are skipped.
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                # '>=' rather than '==': guards against total ever stepping
                # past n (e.g. n == 0), in which case an equality test would
                # never fire and pagination would continue indefinitely.
                if total >= n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3422
3423
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same machinery as YoutubeSearchIE; only the 'sp' filter payload is
    # preset so results come back newest-first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # 'CAI%3D' is the percent-encoded form of 'CAI=' — presumably the
    # base64 sort-by-upload-date filter blob; TODO confirm the API accepts
    # the still-encoded value.
    _SEARCH_PARAMS = 'CAI%3D'
3429
3430
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?... search URLs.

    Parses the query (and the optional 'sp' filter parameter, which may
    appear before or after the query) out of the URL and delegates the
    actual search to YoutubeSearchIE.
    """
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    # NOTE(review): _PARAM_REGEX is unused within this class; kept in case
    # something external references it — confirm and drop.
    _PARAM_REGEX = r''
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?'
    _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        IE = YoutubeSearchIE(self._downloader)
        # 'sp' can precede (param1) or follow (param2) the query parameter.
        IE._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2')
        # Removed a leftover debug to_screen() call that printed the raw
        # search parameters — or 'None' — to the user's console.
        IE._MAX_RESULTS = self._MAX_RESULTS
        return IE._get_n_results(query, self._MAX_RESULTS)
3456
3457
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """Common machinery for the authenticated YouTube feed extractors.

    Concrete subclasses must provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    # _MAX_PAGES = 5

    @property
    def IE_NAME(self):
        # Derived per-subclass, e.g. 'youtube:history'.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-specific, so authenticate up front.
        self._login()

    def _shelf_entries(self, shelf_renderer):
        # Feed shelves only carry grid contents; anything else yields nothing.
        grid = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
        if grid:
            for item in self._grid_entries(grid):
                yield item

    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        tab = self._extract_selected_tab(tabs)
        entries = self._entries(tab['content'], identity_token)
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)

    def _real_extract(self, url):
        # The incoming URL is ignored; the feed is addressed by _FEED_NAME.
        feed_id = self._FEED_NAME
        feed_url = 'https://www.youtube.com/feed/%s' % feed_id
        webpage = self._download_webpage(feed_url, feed_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(feed_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if not tabs:
            # Page layout not understood.
            raise ExtractorError('Unable to recognize feed page')
        return self._extract_from_tabs(feed_id, webpage, data, tabs, identity_token)
3502
3503
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Handles both the web URL and the ':ytwatchlater' shorthand.
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Watch-later is exposed as the special 'WL' playlist, so this
        # bypasses the inherited feed scraping and defers to the playlist
        # extractor instead.
        return self.url_result('WL', ie=YoutubePlaylistIE.ie_key())
3519
3520
class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor):
    # Only reachable via the shorthand (no web URL form is matched).
    IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)'
    _VALID_URL = r':ytfav(?:ou?rite)s?'
    _FEED_NAME = 'favourites'

    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Liked videos live in the special 'LL' playlist, so defer to the
        # playlist extractor instead of the inherited feed scraping.
        return self.url_result('LL', ie=YoutubePlaylistIE.ie_key())
3533
3534
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Attribute-only subclass: all extraction logic lives in
    # YoutubeFeedsInfoExtractor, which fetches /feed/<_FEED_NAME>.
    # Also matches the bare youtube.com front page.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3540
3541
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Attribute-only subclass: all extraction logic lives in
    # YoutubeFeedsInfoExtractor, which fetches /feed/<_FEED_NAME>.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3547
3548
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Attribute-only subclass: all extraction logic lives in
    # YoutubeFeedsInfoExtractor, which fetches /feed/<_FEED_NAME>.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3554
3555
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch/attribution URLs that carry only auxiliary parameters and
    # no video id — typically the result of an unquoted '&' in the shell
    # cutting off the 'v=' part.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing to extract — fail with a hint that the shell most likely
        # split the command line at an unquoted '&'.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3603
3604
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the canonical
    11 characters (e.g. a truncated paste) and fail with a clear message."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)
3620
3621
# Do Youtube show urls even exist anymore? I couldn't find any
# NOTE(review): the extractor below is deliberately disabled by wrapping it
# in a raw-string literal; remove it — or re-enable it — once the status of
# youtube.com/show/ pages is confirmed.
r'''
class YoutubeShowIE(YoutubeTabIE):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
'''