]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Implemented all Youtube Feeds (ytfav, ytwatchlater, ytsubs, ythistory, ytrec) and...
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
27019dbb 29 bool_or_none,
c5e8d7af 30 clean_html,
9b9c5355 31 error_to_compat_str,
c5e8d7af 32 ExtractorError,
2d30521a 33 float_or_none,
4bb4a188 34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
6310acf5 37 parse_codecs,
b84071c0 38 parse_count,
7c80519c 39 parse_duration,
0cb58b02 40 remove_quotes,
3995d37d 41 remove_start,
cf7e015f 42 smuggle_url,
dbdaaa23 43 str_or_none,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
8bdd16b4 49 update_url_query,
81c2f20b 50 uppercase_escape,
21c340b8 51 url_or_none,
6e6bc8da 52 urlencode_postdata,
8bdd16b4 53 urljoin,
c5e8d7af
PH
54)
55
5f6a1245 56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account endpoints. _login() uses _LOGIN_URL, _LOOKUP_URL,
    # _CHALLENGE_URL and _TFA_URL ({0} in _TFA_URL is filled with the "TL"
    # token taken from the challenge response).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex alternation of URL path names.
    # NOTE(review): presumably paths that must not be treated as user/channel
    # names by subclasses - confirm against the users of this constant.
    _RESERVED_NAMES = (
        r'course|embed|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Regex fragment matching a playlist ID: a known prefix followed by at
    # least 10 ID characters, or one of the bare IDs RDMM / LL / WL.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie (which includes hl=en) on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video ID in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) like every other failure
            # path, matching the documented contract.
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST f_req (JSON-encoded) plus the login form fields to url.

            Returns the parsed JSON response; the download is non-fatal
            (fatal=False), and callers below test the result against False.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' so that only the JSON
                # array remains.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False for consistency with the other failure paths.
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The parentheses matter: '%' binds tighter than the conditional
            # expression, so without them any message other than
            # INCORRECT_ANSWER_ENTERED would be reported without the prefix.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Strip a leading 'G-' from the code if present.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as the
                    # password warning above.
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Any other challenge type cannot be solved here; tell the
                # user to solve it in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of 'query'.

        The query dict is copied so the caller's dict is never shared with
        the parent call. A missing or explicit None query becomes an empty
        dict instead of raising AttributeError on None.copy().
        """
        query = (kwargs.get('query') or {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Best-effort extraction of the ytInitialData JSON from webpage.

        Returns the parsed data, or None when the assignment is not found
        (the regex search uses default=None). JSON parsing is non-fatal
        (fatal=False).
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Default request body for _call_api(): the client context.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    def _call_api(self, ep, query, video_id):
        """POST a JSON request to the youtubei/v1 endpoint ep.

        The request body is _DEFAULT_API_DATA with the top-level keys of
        query merged over it. Returns the parsed JSON response.
        """
        # A shallow copy is enough: update() only rebinds top-level keys and
        # never mutates the nested default dicts.
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON object from webpage.

        Unlike _get_yt_initial_data(), no default is passed to the regex
        search and the JSON parse is not marked non-fatal.
        """
        return self._parse_json(
            self._search_regex(
                r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
                webpage, 'yt initial data'),
            video_id)
0c148415
S
328
329
360e1ca5 330class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 331 IE_DESC = 'YouTube.com'
cb7dfeea 332 _VALID_URL = r"""(?x)^
c5e8d7af 333 (
edb53e2d 334 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 335 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 336 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 337 (?:www\.)?pwnyoutube\.com/|
8b561bfc 338 (?:www\.)?hooktube\.com/|
f7000f3a 339 (?:www\.)?yourepeat\.com/|
e69ae5b9 340 tube\.majestyc\.net/|
ba036333 341 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 342 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 343 (?:(?:www|no)\.)?invidiou\.sh/|
344 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 345 (?:www\.)?invidious\.kabi\.tk/|
ba036333 346 (?:www\.)?invidious\.13ad\.de/|
791d2e81 347 (?:www\.)?invidious\.mastodon\.host/|
494d664e 348 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 349 (?:www\.)?invidious\.drycat\.fr/|
ba036333 350 (?:www\.)?tube\.poal\.co/|
8ae113ca 351 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 352 (?:www\.)?yewtu\.be/|
494d664e 353 (?:www\.)?yt\.elukerio\.org/|
894b3826 354 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 355 (?:www\.)?invidious\.ggc-project\.de/|
356 (?:www\.)?yt\.maisputain\.ovh/|
357 (?:www\.)?invidious\.13ad\.de/|
358 (?:www\.)?invidious\.toot\.koeln/|
359 (?:www\.)?invidious\.fdn\.fr/|
360 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 361 (?:www\.)?kgg2m7yk5aybusll\.onion/|
362 (?:www\.)?qklhadlycap4cnod\.onion/|
363 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
364 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
365 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
366 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 367 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 368 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 369 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
370 (?:.*?\#/)? # handle anchor (#/) redirect urls
371 (?: # the various things that can precede the ID:
ac7553d0 372 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 373 |(?: # or the v= param in all its forms
f7000f3a 374 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 375 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 376 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
377 v=
378 )
f4b05232 379 ))
cbaed4bb
S
380 |(?:
381 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
382 vid\.plus| # or vid.plus/xxxx
383 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 384 )/
edb53e2d 385 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 386 )
c5e8d7af 387 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 388 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
389 (?!.*?\blist=
390 (?:
391 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
392 WL # WL are handled by the watch later IE
393 )
394 )
c5e8d7af 395 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 396 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 397 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
398 _PLAYER_INFO_RE = (
399 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
400 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
401 )
2c62dc26 402 _formats = {
c2d3cb4c 403 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
404 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
405 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
406 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
407 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
408 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
409 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
410 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 411 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 412 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
413 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
414 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
415 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
416 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
417 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 418 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 419 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
420 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 421
422
423 # 3D videos
c2d3cb4c 424 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
425 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
426 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
427 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 428 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
429 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
430 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 431
96fb5605 432 # Apple HTTP Live Streaming
11f12195 433 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 434 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
435 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
436 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
437 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
438 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 439 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
440 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
441
442 # DASH mp4 video
d23028a8
S
443 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
444 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
445 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 448 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
449 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
450 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
451 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
453 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
454 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 455
f6f1fc92 456 # Dash mp4 audio
d23028a8
S
457 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
458 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
459 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
460 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
461 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
462 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
463 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
464
465 # Dash webm
d23028a8
S
466 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
467 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
468 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
473 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
474 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
475 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 481 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
482 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
484 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
485 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
487 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
488
489 # Dash webm audio
d23028a8
S
490 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
491 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 492
0857baad 493 # Dash webm audio with opus inside
d23028a8
S
494 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
495 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
496 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 497
ce6b9a2d
PH
498 # RTMP (unnamed)
499 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
500
501 # av01 video only formats sometimes served with "unknown" codecs
502 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
503 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
504 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 506 }
84da5d84 507 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 508
fd5c4aab
S
509 _GEO_BYPASS = False
510
78caa52a 511 IE_NAME = 'youtube'
2eb88d95
PH
512 _TESTS = [
513 {
2d3d2997 514 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
515 'info_dict': {
516 'id': 'BaW_jenozKc',
517 'ext': 'mp4',
3867038a 518 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
519 'uploader': 'Philipp Hagemeister',
520 'uploader_id': 'phihag',
ec85ded8 521 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
522 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
523 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 524 'upload_date': '20121002',
3867038a 525 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 526 'categories': ['Science & Technology'],
3867038a 527 'tags': ['youtube-dl'],
556dbe7f 528 'duration': 10,
dbdaaa23 529 'view_count': int,
3e7c1224
PH
530 'like_count': int,
531 'dislike_count': int,
7c80519c 532 'start_time': 1,
297a564b 533 'end_time': 9,
2eb88d95 534 }
0e853ca4 535 },
fccd3771 536 {
4bc3a23e
PH
537 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
538 'note': 'Embed-only video (#1746)',
539 'info_dict': {
540 'id': 'yZIXLfi8CZQ',
541 'ext': 'mp4',
542 'upload_date': '20120608',
543 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
544 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
545 'uploader': 'SET India',
94bfcd23 546 'uploader_id': 'setindia',
ec85ded8 547 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 548 'age_limit': 18,
fccd3771
PH
549 }
550 },
11b56058 551 {
8bdd16b4 552 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
553 'note': 'Use the first video ID in the URL',
554 'info_dict': {
555 'id': 'BaW_jenozKc',
556 'ext': 'mp4',
3867038a 557 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
558 'uploader': 'Philipp Hagemeister',
559 'uploader_id': 'phihag',
ec85ded8 560 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 561 'upload_date': '20121002',
3867038a 562 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 563 'categories': ['Science & Technology'],
3867038a 564 'tags': ['youtube-dl'],
556dbe7f 565 'duration': 10,
dbdaaa23 566 'view_count': int,
11b56058
PM
567 'like_count': int,
568 'dislike_count': int,
34a7de29
S
569 },
570 'params': {
571 'skip_download': True,
572 },
11b56058 573 },
dd27fd17 574 {
2d3d2997 575 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
576 'note': '256k DASH audio (format 141) via DASH manifest',
577 'info_dict': {
578 'id': 'a9LDPn-MO4I',
579 'ext': 'm4a',
580 'upload_date': '20121002',
581 'uploader_id': '8KVIDEO',
ec85ded8 582 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
583 'description': '',
584 'uploader': '8KVIDEO',
585 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 586 },
4bc3a23e
PH
587 'params': {
588 'youtube_include_dash_manifest': True,
589 'format': '141',
4919603f 590 },
de3c7fe0 591 'skip': 'format 141 not served anymore',
dd27fd17 592 },
8bdd16b4 593 # DASH manifest with encrypted signature
594 {
595 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
596 'info_dict': {
597 'id': 'IB3lcPjvWLA',
598 'ext': 'm4a',
599 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
600 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
601 'duration': 244,
602 'uploader': 'AfrojackVEVO',
603 'uploader_id': 'AfrojackVEVO',
604 'upload_date': '20131011',
605 },
606 'params': {
607 'youtube_include_dash_manifest': True,
608 'format': '141/bestaudio[ext=m4a]',
609 },
610 },
aa79ac0c
PH
611 # Controversy video
612 {
613 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
614 'info_dict': {
615 'id': 'T4XJQO3qol8',
616 'ext': 'mp4',
556dbe7f 617 'duration': 219,
aa79ac0c 618 'upload_date': '20100909',
4fe54c12 619 'uploader': 'Amazing Atheist',
aa79ac0c 620 'uploader_id': 'TheAmazingAtheist',
ec85ded8 621 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
622 'title': 'Burning Everyone\'s Koran',
623 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
624 }
c522adb1 625 },
dd2d55f1 626 # Normal age-gate video (embed allowed)
c522adb1 627 {
2d3d2997 628 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
629 'info_dict': {
630 'id': 'HtVdAasjOgU',
631 'ext': 'mp4',
632 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 633 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 634 'duration': 142,
c522adb1
JMF
635 'uploader': 'The Witcher',
636 'uploader_id': 'WitcherGame',
ec85ded8 637 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 638 'upload_date': '20140605',
34952f09 639 'age_limit': 18,
c522adb1
JMF
640 },
641 },
8bdd16b4 642 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
643 # YouTube Red ad is not captured for creator
644 {
645 'url': '__2ABJjxzNo',
646 'info_dict': {
647 'id': '__2ABJjxzNo',
648 'ext': 'mp4',
649 'duration': 266,
650 'upload_date': '20100430',
651 'uploader_id': 'deadmau5',
652 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
653 'creator': 'Dada Life, deadmau5',
654 'description': 'md5:12c56784b8032162bb936a5f76d55360',
655 'uploader': 'deadmau5',
656 'title': 'Deadmau5 - Some Chords (HD)',
657 'alt_title': 'This Machine Kills Some Chords',
658 },
659 'expected_warnings': [
660 'DASH manifest missing',
661 ]
662 },
067aa17e 663 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
664 {
665 'url': 'lqQg6PlCWgI',
666 'info_dict': {
667 'id': 'lqQg6PlCWgI',
668 'ext': 'mp4',
556dbe7f 669 'duration': 6085,
90227264 670 'upload_date': '20150827',
cbe2bd91 671 'uploader_id': 'olympic',
ec85ded8 672 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 673 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 674 'uploader': 'Olympic',
cbe2bd91
PH
675 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
676 },
677 'params': {
678 'skip_download': 'requires avconv',
e52a40ab 679 }
cbe2bd91 680 },
6271f1ca
PH
681 # Non-square pixels
682 {
683 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
684 'info_dict': {
685 'id': '_b-2C3KPAM0',
686 'ext': 'mp4',
687 'stretched_ratio': 16 / 9.,
556dbe7f 688 'duration': 85,
6271f1ca
PH
689 'upload_date': '20110310',
690 'uploader_id': 'AllenMeow',
ec85ded8 691 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 692 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 693 'uploader': '孫ᄋᄅ',
6271f1ca
PH
694 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
695 },
06b491eb
S
696 },
697 # url_encoded_fmt_stream_map is empty string
698 {
699 'url': 'qEJwOuvDf7I',
700 'info_dict': {
701 'id': 'qEJwOuvDf7I',
f57b7835 702 'ext': 'webm',
06b491eb
S
703 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
704 'description': '',
705 'upload_date': '20150404',
706 'uploader_id': 'spbelect',
707 'uploader': 'Наблюдатели Петербурга',
708 },
709 'params': {
710 'skip_download': 'requires avconv',
e323cf3f
S
711 },
712 'skip': 'This live event has ended.',
06b491eb 713 },
067aa17e 714 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
715 {
716 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
717 'info_dict': {
718 'id': 'FIl7x6_3R5Y',
eb6793ba 719 'ext': 'webm',
da77d856
S
720 'title': 'md5:7b81415841e02ecd4313668cde88737a',
721 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 722 'duration': 220,
da77d856
S
723 'upload_date': '20150625',
724 'uploader_id': 'dorappi2000',
ec85ded8 725 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 726 'uploader': 'dorappi2000',
eb6793ba 727 'formats': 'mincount:31',
da77d856 728 },
eb6793ba 729 'skip': 'not actual anymore',
2ee8f5d8 730 },
8a1a26ce
YCH
731 # DASH manifest with segment_list
732 {
733 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
734 'md5': '8ce563a1d667b599d21064e982ab9e31',
735 'info_dict': {
736 'id': 'CsmdDsKjzN8',
737 'ext': 'mp4',
17ee98e1 738 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
739 'uploader': 'Airtek',
740 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
741 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
742 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
743 },
744 'params': {
745 'youtube_include_dash_manifest': True,
746 'format': '135', # bestvideo
be49068d
S
747 },
748 'skip': 'This live event has ended.',
2ee8f5d8 749 },
cf7e015f
S
750 {
751 # Multifeed videos (multiple cameras), URL is for Main Camera
752 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
753 'info_dict': {
754 'id': 'jqWvoWXjCVs',
755 'title': 'teamPGP: Rocket League Noob Stream',
756 'description': 'md5:dc7872fb300e143831327f1bae3af010',
757 },
758 'playlist': [{
759 'info_dict': {
760 'id': 'jqWvoWXjCVs',
761 'ext': 'mp4',
762 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
763 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 764 'duration': 7335,
cf7e015f
S
765 'upload_date': '20150721',
766 'uploader': 'Beer Games Beer',
767 'uploader_id': 'beergamesbeer',
ec85ded8 768 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 769 'license': 'Standard YouTube License',
cf7e015f
S
770 },
771 }, {
772 'info_dict': {
773 'id': '6h8e8xoXJzg',
774 'ext': 'mp4',
775 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
776 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 777 'duration': 7337,
cf7e015f
S
778 'upload_date': '20150721',
779 'uploader': 'Beer Games Beer',
780 'uploader_id': 'beergamesbeer',
ec85ded8 781 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 782 'license': 'Standard YouTube License',
cf7e015f
S
783 },
784 }, {
785 'info_dict': {
786 'id': 'PUOgX5z9xZw',
787 'ext': 'mp4',
788 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
789 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 790 'duration': 7337,
cf7e015f
S
791 'upload_date': '20150721',
792 'uploader': 'Beer Games Beer',
793 'uploader_id': 'beergamesbeer',
ec85ded8 794 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 795 'license': 'Standard YouTube License',
cf7e015f
S
796 },
797 }, {
798 'info_dict': {
799 'id': 'teuwxikvS5k',
800 'ext': 'mp4',
801 'title': 'teamPGP: Rocket League Noob Stream (zim)',
802 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 803 'duration': 7334,
cf7e015f
S
804 'upload_date': '20150721',
805 'uploader': 'Beer Games Beer',
806 'uploader_id': 'beergamesbeer',
ec85ded8 807 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 808 'license': 'Standard YouTube License',
cf7e015f
S
809 },
810 }],
811 'params': {
812 'skip_download': True,
813 },
4fe54c12 814 'skip': 'This video is not available.',
cbaed4bb 815 },
f9f49d87 816 {
067aa17e 817 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
818 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
819 'info_dict': {
820 'id': 'gVfLd0zydlo',
821 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
822 },
823 'playlist_count': 2,
be49068d 824 'skip': 'Not multifeed anymore',
f9f49d87 825 },
cbaed4bb 826 {
2d3d2997 827 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 828 'only_matching': True,
0e49d9a6 829 },
6d4fc66b 830 {
2d3d2997 831 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
832 'only_matching': True,
833 },
0e49d9a6 834 {
067aa17e 835 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 836 # Also tests cut-off URL expansion in video description (see
067aa17e
S
837 # https://github.com/ytdl-org/youtube-dl/issues/1892,
838 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
839 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
840 'info_dict': {
841 'id': 'lsguqyKfVQg',
842 'ext': 'mp4',
843 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 844 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 845 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 846 'duration': 133,
0e49d9a6
LL
847 'upload_date': '20151119',
848 'uploader_id': 'IronSoulElf',
ec85ded8 849 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 850 'uploader': 'IronSoulElf',
eb6793ba
S
851 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
852 'track': 'Dark Walk - Position Music',
853 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 854 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
855 },
856 'params': {
857 'skip_download': True,
858 },
859 },
61f92af1 860 {
067aa17e 861 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
862 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
863 'only_matching': True,
864 },
313dfc45
LL
865 {
866 # Video with yt:stretch=17:0
867 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
868 'info_dict': {
869 'id': 'Q39EVAstoRM',
870 'ext': 'mp4',
871 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
872 'description': 'md5:ee18a25c350637c8faff806845bddee9',
873 'upload_date': '20151107',
874 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
875 'uploader': 'CH GAMER DROID',
876 },
877 'params': {
878 'skip_download': True,
879 },
be49068d 880 'skip': 'This video does not exist.',
313dfc45 881 },
7caf9830
S
882 {
883 # Video licensed under Creative Commons
884 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
885 'info_dict': {
886 'id': 'M4gD1WSo5mA',
887 'ext': 'mp4',
888 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
889 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 890 'duration': 721,
7caf9830
S
891 'upload_date': '20150127',
892 'uploader_id': 'BerkmanCenter',
ec85ded8 893 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 894 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
895 'license': 'Creative Commons Attribution license (reuse allowed)',
896 },
897 'params': {
898 'skip_download': True,
899 },
900 },
fd050249
S
901 {
902 # Channel-like uploader_url
903 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
904 'info_dict': {
905 'id': 'eQcmzGIKrzg',
906 'ext': 'mp4',
907 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
908 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 909 'duration': 4060,
fd050249 910 'upload_date': '20151119',
eb6793ba 911 'uploader': 'Bernie Sanders',
fd050249 912 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 913 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
914 'license': 'Creative Commons Attribution license (reuse allowed)',
915 },
916 'params': {
917 'skip_download': True,
918 },
919 },
040ac686
S
920 {
921 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
922 'only_matching': True,
7f29cf54
S
923 },
924 {
067aa17e 925 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
926 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
927 'only_matching': True,
6496ccb4
S
928 },
929 {
930 # Rental video preview
931 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
932 'info_dict': {
933 'id': 'uGpuVWrhIzE',
934 'ext': 'mp4',
935 'title': 'Piku - Trailer',
936 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
937 'upload_date': '20150811',
938 'uploader': 'FlixMatrix',
939 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 940 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
941 'license': 'Standard YouTube License',
942 },
943 'params': {
944 'skip_download': True,
945 },
eb6793ba 946 'skip': 'This video is not available.',
022a5d66 947 },
12afdc2a
S
948 {
949 # YouTube Red video with episode data
950 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
951 'info_dict': {
952 'id': 'iqKdEhx-dD4',
953 'ext': 'mp4',
954 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 955 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 956 'duration': 2085,
12afdc2a
S
957 'upload_date': '20170118',
958 'uploader': 'Vsauce',
959 'uploader_id': 'Vsauce',
960 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
961 'series': 'Mind Field',
962 'season_number': 1,
963 'episode_number': 1,
964 },
965 'params': {
966 'skip_download': True,
967 },
968 'expected_warnings': [
969 'Skipping DASH manifest',
970 ],
971 },
c7121fa7
S
972 {
973 # The following content has been identified by the YouTube community
974 # as inappropriate or offensive to some audiences.
975 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
976 'info_dict': {
977 'id': '6SJNVb0GnPI',
978 'ext': 'mp4',
979 'title': 'Race Differences in Intelligence',
980 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
981 'duration': 965,
982 'upload_date': '20140124',
983 'uploader': 'New Century Foundation',
984 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
985 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
986 },
987 'params': {
988 'skip_download': True,
989 },
990 },
022a5d66
S
991 {
992 # itag 212
993 'url': '1t24XAntNCY',
994 'only_matching': True,
fd5c4aab
S
995 },
996 {
997 # geo restricted to JP
998 'url': 'sJL6WA-aGkQ',
999 'only_matching': True,
1000 },
cd5a74a2
S
1001 {
1002 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1003 'only_matching': True,
1004 },
825cd268
RA
1005 {
1006 # DRM protected
1007 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1008 'only_matching': True,
4fe54c12
S
1009 },
1010 {
1011 # Video with unsupported adaptive stream type formats
1012 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1013 'info_dict': {
1014 'id': 'Z4Vy8R84T1U',
1015 'ext': 'mp4',
1016 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1017 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1018 'duration': 433,
1019 'upload_date': '20130923',
1020 'uploader': 'Amelia Putri Harwita',
1021 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1022 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1023 'formats': 'maxcount:10',
1024 },
1025 'params': {
1026 'skip_download': True,
1027 'youtube_include_dash_manifest': False,
1028 },
5429d6a9 1029 'skip': 'not actual anymore',
5caabd3c 1030 },
1031 {
822b9d9c 1032 # Youtube Music Auto-generated description
5caabd3c 1033 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1034 'info_dict': {
1035 'id': 'MgNrAu2pzNs',
1036 'ext': 'mp4',
1037 'title': 'Voyeur Girl',
1038 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1039 'upload_date': '20190312',
5429d6a9
S
1040 'uploader': 'Stephen - Topic',
1041 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1042 'artist': 'Stephen',
1043 'track': 'Voyeur Girl',
1044 'album': 'it\'s too much love to know my dear',
1045 'release_date': '20190313',
1046 'release_year': 2019,
1047 },
1048 'params': {
1049 'skip_download': True,
1050 },
1051 },
66b48727
RA
1052 {
1053 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1054 'only_matching': True,
1055 },
011e75e6
S
1056 {
1057 # invalid -> valid video id redirection
1058 'url': 'DJztXj2GPfl',
1059 'info_dict': {
1060 'id': 'DJztXj2GPfk',
1061 'ext': 'mp4',
1062 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1063 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1064 'upload_date': '20090125',
1065 'uploader': 'Prochorowka',
1066 'uploader_id': 'Prochorowka',
1067 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1068 'artist': 'Panjabi MC',
1069 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1070 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1071 },
1072 'params': {
1073 'skip_download': True,
1074 },
ea74e00b
DP
1075 },
1076 {
1077 # empty description results in an empty string
1078 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1079 'info_dict': {
1080 'id': 'x41yOUIvK2k',
1081 'ext': 'mp4',
1082 'title': 'IMG 3456',
1083 'description': '',
1084 'upload_date': '20170613',
1085 'uploader_id': 'ElevageOrVert',
1086 'uploader': 'ElevageOrVert',
1087 },
1088 'params': {
1089 'skip_download': True,
1090 },
1091 },
2eb88d95
PH
1092 ]
1093
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature, keyed by
    # (player_url, signature cache id).
    self._player_cache = dict()
e0df6211 1097
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage of video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1101
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that information about video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1105
def report_unavailable_format(self, video_id, format):
    """Print a note that the given format is not available for video_id."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1109
def report_rtmp_download(self):
    """Print a note that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1113
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1117
e40c758c
S
1118 @classmethod
1119 def _extract_player_info(cls, player_url):
1120 for player_re in cls._PLAYER_INFO_RE:
1121 id_m = re.search(player_re, player_url)
1122 if id_m:
1123 break
1124 else:
c081b35c 1125 raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1126 return id_m.group('ext'), id_m.group('id')
1127
1128 def _extract_signature_function(self, video_id, player_url, example_sig):
1129 player_type, player_id = self._extract_player_info(player_url)
e0df6211 1130
c4417ddb 1131 # Read from filesystem cache
60064c53
PH
1132 func_id = '%s_%s_%s' % (
1133 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 1134 assert os.path.basename(func_id) == func_id
a0e07d31 1135
69ea8ca4 1136 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1137 if cache_spec is not None:
78caa52a 1138 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1139
6d1a55a5
PH
1140 download_note = (
1141 'Downloading player %s' % player_url
1142 if self._downloader.params.get('verbose') else
1143 'Downloading %s player %s' % (player_type, player_id)
1144 )
e0df6211
PH
1145 if player_type == 'js':
1146 code = self._download_webpage(
1147 player_url, video_id,
6d1a55a5 1148 note=download_note,
69ea8ca4 1149 errnote='Download of %s failed' % player_url)
83799698 1150 res = self._parse_sig_js(code)
c4417ddb 1151 elif player_type == 'swf':
e0df6211
PH
1152 urlh = self._request_webpage(
1153 player_url, video_id,
6d1a55a5 1154 note=download_note,
69ea8ca4 1155 errnote='Download of %s failed' % player_url)
e0df6211 1156 code = urlh.read()
83799698 1157 res = self._parse_sig_swf(code)
e0df6211
PH
1158 else:
1159 assert False, 'Invalid player type %r' % player_type
1160
785521bf
PH
1161 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1162 cache_res = res(test_string)
1163 cache_spec = [ord(c) for c in cache_res]
83799698 1164
69ea8ca4 1165 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1166 return res
1167
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to func for signatures shaped like example_sig.

    func is applied to a probe string whose character codes are 0..n-1; the
    resulting index sequence is rendered as a sum of s[...] items and slices.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            lower = str(start) if start != 0 else ''
            stop = end + step
            upper = (':%d' % stop) if stop >= 0 else ':'
            stride = (':%d' % step) if step != 1 else ''
            return 's[%s%s%s]' % (lower, upper, stride)

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for prev, cur in zip(idxs[:-1], idxs[1:]):
            delta = cur - prev
            if step is not None:
                # Inside a run: emit it as a slice once the stride changes.
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in [-1, 1]:
                # A run of consecutive indices starts at prev.
                step = delta
                start = prev
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending for the last index.
        if step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(start, cur, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_spec = [ord(ch) for ch in permuted]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(
        compat_str(len(part)) for part in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1206
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable running the signature function found in the JS player code.

    The function name is searched with the patterns below, in order; the
    function itself is extracted with JSInterpreter.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    initial_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return initial_function([s])

    return decipher
1227
def _parse_sig_swf(self, file_contents):
    """Return a callable running SignatureDecipher.decipher from the SWF player."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return initial_function([s])

    return decipher
1234
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The deciphering function is extracted once per
    (player_url, signature cache id) and kept in self._player_cache.
    Any exception raised on the way is re-raised as ExtractorError carrying
    the formatted traceback. age_gate is accepted but not used here.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Normalise protocol-relative and site-relative player URLs.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 1261
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return a dict mapping language codes to lists of subtitle format dicts.

    The track list is downloaded from the timedtext endpoint; for each
    language only the first track is used. When has_live_chat_replay is
    true a 'live_chat' entry is added. Returns {} (after a warning) when the
    list cannot be downloaded or nothing was found. webpage is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in sub_lang_list:
            continue
        sub_lang_list[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        sub_lang_list['live_chat'] = [live_chat_entry]

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1301
a72778d3
S
1302 def _get_ytplayer_config(self, video_id, webpage):
1303 patterns = (
526b3b07
S
1304 # User data may contain arbitrary character sequences that may affect
1305 # JSON extraction with regex, e.g. when '};' is contained the second
1306 # regex won't capture the whole JSON. Yet working around by trying more
1307 # concrete regex first keeping in mind proper quoted string handling
1308 # to be implemented in future that will replace this workaround (see
067aa17e
S
1309 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1310 # https://github.com/ytdl-org/youtube-dl/pull/7599)
a72778d3
S
1311 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1312 r';ytplayer\.config\s*=\s*({.+?});',
8bdd16b4 1313 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
a72778d3
S
1314 )
1315 config = self._search_regex(
1316 patterns, webpage, 'ytplayer.config', default=None)
1317 if config:
1318 return self._parse_json(
1319 uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1320
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music tracks from the metadata rows of the initial data.

    Rows titled 'Album', 'Artist' and 'Song' are mapped to the keys 'album',
    'artist' and 'track'. A key that repeats within one row container starts
    a new track. Returns a list of dicts, empty when nothing usable is found.
    """
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    music_metadata = []
    contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
        list)
    for content in contents or []:
        rows = try_get(
            content,
            lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
            list)
        if not rows:
            continue
        music_track = {}
        for row in rows:
            renderer = try_get(row, lambda x: x['metadataRowRenderer'], dict)
            if renderer is None:
                continue
            key = try_get(renderer, lambda x: x['title']['simpleText'])
            value = (
                try_get(renderer, lambda x: x['contents'][0]['simpleText'])
                or try_get(renderer, lambda x: x['contents'][0]['runs'][0]['text']))
            # compat_str rather than str: on Python 2 JSON text is unicode,
            # which a plain `str` check would reject.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1358
def _get_automatic_captions(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Returns a dict mapping language codes to lists of {'url', 'ext'} dicts,
    or {} (after a warning) when no automatic captions can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                caption_url + '&' + list_query, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            translated = {}
            for target_node in caption_list.findall('target'):
                target_lang = target_node.attrib['lang_code']
                formats = []
                for ext in self._SUBTITLE_FORMATS:
                    query = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    formats.append({
                        'url': caption_url + '&' + query,
                        'ext': ext,
                    })
                translated[target_lang] = formats
            return translated

        def make_captions(sub_url, sub_langs):
            # Rewrite the query of sub_url once per (language, format) pair.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    new_query = compat_urllib_parse_urlencode(caption_qs, True)
                    formats.append({
                        'url': compat_urlparse.urlunparse(
                            parsed_sub_url._replace(query=new_query)),
                        'ext': ext,
                    })
                captions[sub_lang] = formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = []
                for translation in renderer['translationLanguages']:
                    code = translation.get('languageCode')
                    if code:
                        lang_codes.append(code)
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        lang_codes = []
        for encoded_lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(encoded_lang))
            code = lang_qs.get('lc', [None])[0]
            if code:
                lang_codes.append(code)
        return make_captions(caption_url, lang_codes)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1460
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback tracking URL so the video gets marked as watched.

    The URL is taken from player_response, falling back to video_info;
    nothing is done when neither provides a valid URL. The request itself
    is non-fatal.
    """
    tracking_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not tracking_url:
        tracking_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(tracking_url)
    if not playback_url:
        return

    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    qs['ver'] = ['2']
    qs['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(qs, True)
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1486
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return the YouTube embeds found in webpage, in order of discovery.

    Three sources are scanned: embedded player URLs (iframe, embed, object,
    data-video-url, SWFObject calls), lazyYT 'data-youtube-id' attributes
    and the 'data-video_id' attribute of the Wordpress "YouTube Video
    Importer" plugin. The first yields URLs, the other two yield the
    attribute values as found.
    """
    # Embedded YouTube player
    # NOTE: the embedSWF alternative used to read `embedSWF\(?:\s*`, which
    # demands a literal ':' and therefore never matched `embedSWF("...")`.
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall returns one tuple per match; the id is the last group.
    entries.extend(m[-1] for m in matches)

    return entries
1518
@staticmethod
def _extract_url(webpage):
    """Return the first entry of YoutubeIE._extract_urls(webpage), or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1523
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of ``cls._VALID_URL`` matched on *url*.

    The pattern is applied with ``re.VERBOSE`` and anchored at the start of
    *url*. Presumably group 2 is the video id (the pattern itself is defined
    elsewhere).

    Raises ExtractorError when *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1531
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build a chapter list from the initial data embedded in *webpage*.

    Returns None when there is no webpage, no usable initial-data dict or
    no chapter list in it; otherwise returns a list of dicts with
    ``start_time``, ``end_time`` and ``title`` keys. A chapter ends where
    the next one starts; the last chapter ends at *duration*. Chapters
    whose start or end time cannot be determined are skipped.
    """
    if not webpage:
        return
    initial_data = self._extract_yt_initial_data(video_id, webpage)
    if not (initial_data and isinstance(initial_data, dict)):
        return
    chapter_items = try_get(
        initial_data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not chapter_items:
        return

    def start_seconds(item):
        # timeRangeStartMillis is in milliseconds; scale it to seconds.
        millis = try_get(
            item,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(chapter_items)
    result = []
    for index, item in enumerate(chapter_items):
        begin = start_seconds(item)
        if begin is None:
            continue
        if index + 1 < total:
            finish = start_seconds(chapter_items[index + 1])
        else:
            finish = duration
        if finish is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': try_get(
                item, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1576
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build a chapter list from timestamp links in an HTML description.

    A chapter line is a ``<br />``-delimited piece of *description* that
    contains an ``<a ... onclick="yt.www.watch.player.seekTo...">`` link
    whose text is a ``M:SS`` or ``H:MM:SS`` style time point.

    *duration* is the video length in seconds and may be None. When it is
    known, chapters starting after it end the scan and end times are
    clamped to it. When it is None no clamping is done and the last
    chapter, whose end would be the duration, is skipped.

    Returns None when there is no description or no chapter line in it,
    otherwise a list of dicts with ``start_time``, ``end_time`` and
    ``title`` keys.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so
        # only compare against the duration when it is actually known.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # The title is what is left of the line once the link is removed.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1611
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, else from the description.

    The description-based extraction is only attempted when the JSON-based
    one yields nothing (None or an empty list).
    """
    json_chapters = self._extract_chapters_from_json(
        webpage, video_id, duration)
    if json_chapters:
        return json_chapters
    return self._extract_chapters_from_description(description, duration)
1615
c5e8d7af 1616 def _real_extract(self, url):
cf7e015f
S
1617 url, smuggled_data = unsmuggle_url(url, {})
1618
7e8c0af0 1619 proto = (
78caa52a
PH
1620 'http' if self._downloader.params.get('prefer_insecure', False)
1621 else 'https')
7e8c0af0 1622
7c80519c 1623 start_time = None
297a564b 1624 end_time = None
7c80519c
JMF
1625 parsed_url = compat_urllib_parse_urlparse(url)
1626 for component in [parsed_url.fragment, parsed_url.query]:
1627 query = compat_parse_qs(component)
297a564b 1628 if start_time is None and 't' in query:
7c80519c 1629 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1630 if start_time is None and 'start' in query:
1631 start_time = parse_duration(query['start'][0])
297a564b
JMF
1632 if end_time is None and 'end' in query:
1633 end_time = parse_duration(query['end'][0])
7c80519c 1634
c5e8d7af
PH
1635 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1636 mobj = re.search(self._NEXT_URL_RE, url)
1637 if mobj:
7fd002c0 1638 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1639 video_id = self.extract_id(url)
c5e8d7af
PH
1640
1641 # Get video webpage
aa79ac0c 1642 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1643 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1644
1645 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1646 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1647
1648 # Attempt to extract SWF player URL
e0df6211 1649 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1650 if mobj is not None:
1651 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1652 else:
1653 player_url = None
1654
d8d24a92
S
1655 dash_mpds = []
1656
1657 def add_dash_mpd(video_info):
1658 dash_mpd = video_info.get('dashmpd')
1659 if dash_mpd and dash_mpd[0] not in dash_mpds:
1660 dash_mpds.append(dash_mpd[0])
1661
561b456e
S
1662 def add_dash_mpd_pr(pl_response):
1663 dash_mpd = url_or_none(try_get(
1664 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1665 compat_str))
1666 if dash_mpd and dash_mpd not in dash_mpds:
1667 dash_mpds.append(dash_mpd)
1668
c7121fa7
S
1669 is_live = None
1670 view_count = None
1671
1672 def extract_view_count(v_info):
1673 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1674
c2d125d9
S
1675 def extract_player_response(player_response, video_id):
1676 pl_response = str_or_none(player_response)
1677 if not pl_response:
1678 return
1679 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1680 if isinstance(pl_response, dict):
1681 add_dash_mpd_pr(pl_response)
1682 return pl_response
1683
fb2c9277
U
1684 def extract_embedded_config(embed_webpage, video_id):
1685 embedded_config = self._search_regex(
1686 r'setConfig\(({.*})\);',
1687 embed_webpage, 'ytInitialData', default=None)
1688 if embedded_config:
1689 return embedded_config
1690
dbdaaa23
S
1691 player_response = {}
1692
c5e8d7af 1693 # Get video info
43ebf77d 1694 video_info = {}
6449cd80 1695 embed_webpage = None
39e7107d
U
1696 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1697 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1698 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1699 age_gate = True
1700 # We simulate the access to the video from www.youtube.com/v/{video_id}
1701 # this can be viewed without login into Youtube
beb95e77
CL
1702 url = proto + '://www.youtube.com/embed/%s' % video_id
1703 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1704 ext = extract_embedded_config(embed_webpage, video_id)
1705 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1706 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1707 if not playable_in_embed:
1708 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1709 playable_in_embed = ''
1710 else:
1711 playable_in_embed = playable_in_embed.group('playableinEmbed')
1712 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1713 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1714 if playable_in_embed == 'false':
c73baf23
U
1715 '''
1716 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1717 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1718 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1719 '''
1720 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1721 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1722 age_gate = False
1723 # Try looking directly into the video webpage
1724 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1725 if ytplayer_config:
59c5fa91
PO
1726 args = ytplayer_config.get("args")
1727 if args is not None:
1728 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1729 # Convert to the same format returned by compat_parse_qs
1730 video_info = dict((k, [v]) for k, v in args.items())
1731 add_dash_mpd(video_info)
1732 # Rental video is not rented but preview is available (e.g.
1733 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1734 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1735 if not video_info and args.get('ypc_vid'):
1736 return self.url_result(
1737 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1738 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1739 is_live = True
1740 if not player_response:
1741 player_response = extract_player_response(args.get('player_response'), video_id)
1742 elif not player_response:
1743 player_response = ytplayer_config
4bb9c880
U
1744 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1745 add_dash_mpd_pr(player_response)
9d9314cb
U
1746 else:
1747 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1748 else:
1749 data = compat_urllib_parse_urlencode({
1750 'video_id': video_id,
1751 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1752 'sts': self._search_regex(
1753 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1754 })
1755 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1756 try:
1757 video_info_webpage = self._download_webpage(
1758 video_info_url, video_id,
1759 note='Refetching age-gated info webpage',
1760 errnote='unable to download video info webpage')
1761 except ExtractorError:
1762 video_info_webpage = None
1763 if video_info_webpage:
1764 video_info = compat_parse_qs(video_info_webpage)
1765 pl_response = video_info.get('player_response', [None])[0]
1766 player_response = extract_player_response(pl_response, video_id)
1767 add_dash_mpd(video_info)
1768 view_count = extract_view_count(video_info)
c108eb73
JMF
1769 else:
1770 age_gate = False
d8d24a92 1771 # Try looking directly into the video webpage
a72778d3 1772 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
8bdd16b4 1773 if ytplayer_config:
1774 args = ytplayer_config.get('args', {})
4c76aa06 1775 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1776 # Convert to the same format returned by compat_parse_qs
1777 video_info = dict((k, [v]) for k, v in args.items())
1778 add_dash_mpd(video_info)
6496ccb4
S
1779 # Rental video is not rented but preview is available (e.g.
1780 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1781 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1782 if not video_info and args.get('ypc_vid'):
1783 return self.url_result(
1784 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1785 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1786 is_live = True
dbdaaa23 1787 if not player_response:
c2d125d9 1788 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1789 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1790 add_dash_mpd_pr(player_response)
bbb7c3f7 1791
8bdd16b4 1792 if not video_info and not player_response:
1793 player_response = extract_player_response(
1794 self._search_regex(
1795 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1796 'initial player response', default='{}'),
1797 video_id)
1798
bbb7c3f7 1799 def extract_unavailable_message():
0add33ab
S
1800 messages = []
1801 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1802 msg = self._html_search_regex(
1803 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1804 video_webpage, 'unavailable %s' % kind, default=None)
1805 if msg:
1806 messages.append(msg)
1807 if messages:
1808 return '\n'.join(messages)
bbb7c3f7 1809
f93abcf1 1810 if not video_info and not player_response:
15be3eb5
RA
1811 unavailable_message = extract_unavailable_message()
1812 if not unavailable_message:
1813 unavailable_message = 'Unable to extract video data'
1814 raise ExtractorError(
1815 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1816
f93abcf1
S
1817 if not isinstance(video_info, dict):
1818 video_info = {}
1819
dbdaaa23
S
1820 video_details = try_get(
1821 player_response, lambda x: x['videoDetails'], dict) or {}
1822
37357d21
S
1823 microformat = try_get(
1824 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1825
8dbf751a
RA
1826 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1827 if not video_title:
cf7e015f
S
1828 self._downloader.report_warning('Unable to extract video title')
1829 video_title = '_'
1830
9cafc3fd 1831 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1832 if video_description:
fa4bc6e7
RA
1833
1834 def replace_url(m):
1835 redir_url = compat_urlparse.urljoin(url, m.group(1))
1836 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1837 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1838 qs = compat_parse_qs(parsed_redir_url.query)
1839 q = qs.get('q')
1840 if q and q[0]:
1841 return q[0]
1842 return redir_url
1843
9cafc3fd 1844 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1845 <a\s+
25cb7a0e 1846 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1847 (?:title|href)="([^"]+)"\s+
25cb7a0e 1848 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1849 class="[^"]*"[^>]*>
23f13e97 1850 [^<]+\.{3}\s*
cf7e015f 1851 </a>
fa4bc6e7 1852 ''', replace_url, video_description)
cf7e015f
S
1853 video_description = clean_html(video_description)
1854 else:
ea74e00b
DP
1855 video_description = video_details.get('shortDescription')
1856 if video_description is None:
1857 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1858
8fe10494 1859 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1860 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1861 multifeed_metadata_list = try_get(
1862 player_response,
1863 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1864 compat_str) or try_get(
1865 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1866 if multifeed_metadata_list:
1867 entries = []
1868 feed_ids = []
1869 for feed in multifeed_metadata_list.split(','):
1870 # Unquote should take place before split on comma (,) since textual
1871 # fields may contain comma as well (see
067aa17e 1872 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1873 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1874
1875 def feed_entry(name):
1876 return try_get(feed_data, lambda x: x[name][0], compat_str)
1877
1878 feed_id = feed_entry('id')
1879 if not feed_id:
1880 continue
1881 feed_title = feed_entry('title')
1882 title = video_title
1883 if feed_title:
1884 title += ' (%s)' % feed_title
8fe10494
S
1885 entries.append({
1886 '_type': 'url_transparent',
1887 'ie_key': 'Youtube',
1888 'url': smuggle_url(
1889 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1890 {'force_singlefeed': True}),
6b09401b 1891 'title': title,
8fe10494 1892 })
6b09401b 1893 feed_ids.append(feed_id)
8fe10494
S
1894 self.to_screen(
1895 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1896 % (', '.join(feed_ids), video_id))
1897 return self.playlist_result(entries, video_id, video_title, video_description)
1898 else:
1899 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1900
c7121fa7 1901 if view_count is None:
1c9c8de2 1902 view_count = extract_view_count(video_info)
dbdaaa23
S
1903 if view_count is None and video_details:
1904 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1905 if view_count is None and microformat:
1906 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1907
27019dbb 1908 if is_live is None:
898238e9 1909 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1910
321bf820 1911 has_live_chat_replay = False
f0f76a33 1912 if not is_live:
321bf820 1913 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1914 try:
1915 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1916 has_live_chat_replay = True
f0f76a33 1917 except (KeyError, IndexError, TypeError):
321bf820 1918 pass
1919
c5e8d7af
PH
1920 # Check for "rental" videos
1921 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1922 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1923
c63ca0ee
S
1924 def _extract_filesize(media_url):
1925 return int_or_none(self._search_regex(
1926 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1927
bf1317d2
S
1928 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1929 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1930
c5e8d7af
PH
1931 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1932 self.report_rtmp_download()
dd27fd17
PH
1933 formats = [{
1934 'format_id': '_rtmp',
1935 'protocol': 'rtmp',
1936 'url': video_info['conn'][0],
1937 'player_url': player_url,
1938 }]
bf1317d2 1939 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1940 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1941 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1942 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1943 formats = []
3318832e 1944 formats_spec = {}
82156fdb 1945 fmt_list = video_info.get('fmt_list', [''])[0]
1946 if fmt_list:
1947 for fmt in fmt_list.split(','):
1948 spec = fmt.split('/')
3318832e 1949 if len(spec) > 1:
1950 width_height = spec[1].split('x')
1951 if len(width_height) == 2:
1952 formats_spec[spec[0]] = {
1953 'resolution': spec[1],
1954 'width': int_or_none(width_height[0]),
1955 'height': int_or_none(width_height[1]),
1956 }
bf1317d2
S
1957 for fmt in streaming_formats:
1958 itag = str_or_none(fmt.get('itag'))
1959 if not itag:
201e9eaa 1960 continue
bf1317d2
S
1961 quality = fmt.get('quality')
1962 quality_label = fmt.get('qualityLabel') or quality
1963 formats_spec[itag] = {
1964 'asr': int_or_none(fmt.get('audioSampleRate')),
1965 'filesize': int_or_none(fmt.get('contentLength')),
1966 'format_note': quality_label,
1967 'fps': int_or_none(fmt.get('fps')),
1968 'height': int_or_none(fmt.get('height')),
bf1317d2
S
1969 # bitrate for itag 43 is always 2147483647
1970 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1971 'width': int_or_none(fmt.get('width')),
1972 }
1973
1974 for fmt in streaming_formats:
00eb865b 1975 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
1976 continue
1977 url = url_or_none(fmt.get('url'))
1978
1979 if not url:
fa3db383 1980 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
1981 if not cipher:
1982 continue
1983 url_data = compat_parse_qs(cipher)
1984 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1985 if not url:
1986 continue
1987 else:
1988 cipher = None
1989 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1990
2f483bc1
S
1991 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1992 # Unsupported FORMAT_STREAM_TYPE_OTF
1993 if stream_type == 3:
1994 continue
6449cd80 1995
bf1317d2
S
1996 format_id = fmt.get('itag') or url_data['itag'][0]
1997 if not format_id:
1998 continue
1999 format_id = compat_str(format_id)
a49eccdf 2000
bf1317d2
S
2001 if cipher:
2002 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
8bdd16b4 2003 ASSETS_RE = (
2004 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2005 r'"jsUrl"\s*:\s*("[^"]+")',
2006 r'"assets":.+?"js":\s*("[^"]+")')
bf1317d2
S
2007 jsplayer_url_json = self._search_regex(
2008 ASSETS_RE,
2009 embed_webpage if age_gate else video_webpage,
2010 'JS player URL (1)', default=None)
2011 if not jsplayer_url_json and not age_gate:
2012 # We need the embed website after all
2013 if embed_webpage is None:
2014 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2015 embed_webpage = self._download_webpage(
2016 embed_url, video_id, 'Downloading embed webpage')
2017 jsplayer_url_json = self._search_regex(
2018 ASSETS_RE, embed_webpage, 'JS player URL')
2019
2020 player_url = json.loads(jsplayer_url_json)
cf010131 2021 if player_url is None:
bf1317d2
S
2022 player_url_json = self._search_regex(
2023 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2024 video_webpage, 'age gate player URL')
2025 player_url = json.loads(player_url_json)
2026
2027 if 'sig' in url_data:
2028 url += '&signature=' + url_data['sig'][0]
2029 elif 's' in url_data:
2030 encrypted_sig = url_data['s'][0]
2031
2032 if self._downloader.params.get('verbose'):
2033 if player_url is None:
bf1317d2 2034 player_desc = 'unknown'
cf010131 2035 else:
e40c758c
S
2036 player_type, player_version = self._extract_player_info(player_url)
2037 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2038 parts_sizes = self._signature_cache_id(encrypted_sig)
2039 self.to_screen('{%s} signature length %s, %s' %
2040 (format_id, parts_sizes, player_desc))
2041
2042 signature = self._decrypt_signature(
2043 encrypted_sig, video_id, player_url, age_gate)
2044 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2045 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2046 if 'ratebypass' not in url:
2047 url += '&ratebypass=yes'
c9afb51c 2048
94278f72
YCH
2049 dct = {
2050 'format_id': format_id,
2051 'url': url,
2052 'player_url': player_url,
2053 }
2054 if format_id in self._formats:
2055 dct.update(self._formats[format_id])
3318832e 2056 if format_id in formats_spec:
2057 dct.update(formats_spec[format_id])
94278f72 2058
aabc2be6 2059 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2060 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2061 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2062 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2063 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2064
bf1317d2
S
2065 if width is None:
2066 width = int_or_none(fmt.get('width'))
2067 if height is None:
2068 height = int_or_none(fmt.get('height'))
2069
c63ca0ee
S
2070 filesize = int_or_none(url_data.get(
2071 'clen', [None])[0]) or _extract_filesize(url)
2072
bf1317d2
S
2073 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2074 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2075
4878759f
S
2076 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2077 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2078 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2079
94278f72 2080 more_fields = {
c63ca0ee 2081 'filesize': filesize,
bf1317d2 2082 'tbr': tbr,
c9afb51c
AH
2083 'width': width,
2084 'height': height,
bf1317d2
S
2085 'fps': fps,
2086 'format_note': quality_label or quality,
c9afb51c 2087 }
94278f72
YCH
2088 for key, value in more_fields.items():
2089 if value:
2090 dct[key] = value
bf1317d2 2091 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2092 if type_:
2093 type_split = type_.split(';')
2094 kind_ext = type_split[0].split('/')
2095 if len(kind_ext) == 2:
94278f72
YCH
2096 kind, _ = kind_ext
2097 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2098 if kind in ('audio', 'video'):
2099 codecs = None
2100 for mobj in re.finditer(
2101 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2102 if mobj.group('key') == 'codecs':
2103 codecs = mobj.group('val')
2104 break
2105 if codecs:
6310acf5 2106 dct.update(parse_codecs(codecs))
e4a60912
S
2107 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2108 dct['downloader_options'] = {
2109 # Youtube throttles chunks >~10M
2110 'http_chunk_size': 10485760,
2111 }
aabc2be6 2112 formats.append(dct)
c5e8d7af 2113 else:
c3e54389
S
2114 manifest_url = (
2115 url_or_none(try_get(
2116 player_response,
2117 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2118 compat_str))
2119 or url_or_none(try_get(
c3e54389
S
2120 video_info, lambda x: x['hlsvp'][0], compat_str)))
2121 if manifest_url:
2122 formats = []
2123 m3u8_formats = self._extract_m3u8_formats(
2124 manifest_url, video_id, 'mp4', fatal=False)
2125 for a_format in m3u8_formats:
2126 itag = self._search_regex(
2127 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2128 if itag:
2129 a_format['format_id'] = itag
2130 if itag in self._formats:
2131 dct = self._formats[itag].copy()
2132 dct.update(a_format)
2133 a_format = dct
2134 a_format['player_url'] = player_url
2135 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2136 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2137 if self._downloader.params.get('youtube_include_hls_manifest', True):
2138 formats.append(a_format)
c3e54389 2139 else:
13577349 2140 error_message = extract_unavailable_message()
c3e54389 2141 if not error_message:
13577349
S
2142 error_message = clean_html(try_get(
2143 player_response, lambda x: x['playabilityStatus']['reason'],
2144 compat_str))
2145 if not error_message:
2146 error_message = clean_html(
2147 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2148 if error_message:
2149 raise ExtractorError(error_message, expected=True)
2150 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2151
7e72694b 2152 # uploader
dbdaaa23
S
2153 video_uploader = try_get(
2154 video_info, lambda x: x['author'][0],
2155 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2156 if video_uploader:
2157 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2158 else:
2159 self._downloader.report_warning('unable to extract uploader name')
2160
2161 # uploader_id
2162 video_uploader_id = None
2163 video_uploader_url = None
2164 mobj = re.search(
2165 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2166 video_webpage)
2167 if mobj is not None:
2168 video_uploader_id = mobj.group('uploader_id')
2169 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2170 else:
2171 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2172 if owner_profile_url:
2173 video_uploader_id = self._search_regex(
2174 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2175 default=None)
2176 video_uploader_url = owner_profile_url
7e72694b 2177
b45a9e69 2178 channel_id = (
3089bc74
S
2179 str_or_none(video_details.get('channelId'))
2180 or self._html_search_meta(
2181 'channelId', video_webpage, 'channel id', default=None)
2182 or self._search_regex(
b45a9e69 2183 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2184 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2185 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2186
b477fc13
S
2187 thumbnails = []
2188 thumbnails_list = try_get(
2189 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2190 for t in thumbnails_list:
2191 if not isinstance(t, dict):
2192 continue
2193 thumbnail_url = url_or_none(t.get('url'))
2194 if not thumbnail_url:
2195 continue
2196 thumbnails.append({
2197 'url': thumbnail_url,
2198 'width': int_or_none(t.get('width')),
2199 'height': int_or_none(t.get('height')),
2200 })
2201
2202 if not thumbnails:
7e72694b 2203 video_thumbnail = None
b477fc13
S
2204 # We try first to get a high quality image:
2205 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2206 video_webpage, re.DOTALL)
2207 if m_thumb is not None:
2208 video_thumbnail = m_thumb.group(1)
2209 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2210 if thumbnail_url:
2211 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2212 if video_thumbnail:
2213 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2214
2215 # upload date
2216 upload_date = self._html_search_meta(
2217 'datePublished', video_webpage, 'upload date', default=None)
2218 if not upload_date:
2219 upload_date = self._search_regex(
2220 [r'(?s)id="eow-date.*?>(.*?)</span>',
2221 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2222 video_webpage, 'upload date', default=None)
37357d21
S
2223 if not upload_date:
2224 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2225 upload_date = unified_strdate(upload_date)
2226
2227 video_license = self._html_search_regex(
2228 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2229 video_webpage, 'license', default=None)
2230
2231 m_music = re.search(
2232 r'''(?x)
2233 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2234 <ul[^>]*>\s*
2235 <li>(?P<title>.+?)
2236 by (?P<creator>.+?)
2237 (?:
2238 \(.+?\)|
2239 <a[^>]*
2240 (?:
2241 \bhref=["\']/red[^>]*>| # drop possible
2242 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2243 )
2244 .*?
2245 )?</li
2246 ''',
2247 video_webpage)
2248 if m_music:
2249 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2250 video_creator = clean_html(m_music.group('creator'))
2251 else:
2252 video_alt_title = video_creator = None
2253
2254 def extract_meta(field):
2255 return self._html_search_regex(
2256 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2257 video_webpage, field, default=None)
2258
2259 track = extract_meta('Song')
2260 artist = extract_meta('Artist')
92bc97d3 2261 album = extract_meta('Album')
822b9d9c
RA
2262
2263 # Youtube Music Auto-generated description
92bc97d3 2264 release_date = release_year = None
822b9d9c
RA
2265 if video_description:
2266 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2267 if mobj:
2268 if not track:
2269 track = mobj.group('track').strip()
2270 if not artist:
2271 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2272 if not album:
2273 album = mobj.group('album'.strip())
822b9d9c
RA
2274 release_year = mobj.group('release_year')
2275 release_date = mobj.group('release_date')
2276 if release_date:
2277 release_date = release_date.replace('-', '')
2278 if not release_year:
2279 release_year = int(release_date[:4])
2280 if release_year:
2281 release_year = int(release_year)
7e72694b 2282
9322f116 2283 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2284 if yt_initial:
2285 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2286 if len(music_metadata):
2287 album = music_metadata[0].get('album')
2288 artist = music_metadata[0].get('artist')
2289 track = music_metadata[0].get('track')
2290
7e72694b
S
2291 m_episode = re.search(
2292 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2293 video_webpage)
2294 if m_episode:
c2dd2dc0 2295 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2296 season_number = int(m_episode.group('season'))
2297 episode_number = int(m_episode.group('episode'))
2298 else:
2299 series = season_number = episode_number = None
2300
2301 m_cat_container = self._search_regex(
2302 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2303 video_webpage, 'categories', default=None)
dbeafce5 2304 category = None
7e72694b
S
2305 if m_cat_container:
2306 category = self._html_search_regex(
2307 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2308 default=None)
dbeafce5
S
2309 if not category:
2310 category = try_get(
2311 microformat, lambda x: x['category'], compat_str)
2312 video_categories = None if category is None else [category]
7e72694b
S
2313
2314 video_tags = [
2315 unescapeHTML(m.group('content'))
2316 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2317 if not video_tags:
2318 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2319
2320 def _extract_count(count_name):
2321 return str_to_int(self._search_regex(
8bdd16b4 2322 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
7e72694b
S
2323 % re.escape(count_name),
2324 video_webpage, count_name, default=None))
2325
2326 like_count = _extract_count('like')
2327 dislike_count = _extract_count('dislike')
2328
dbdaaa23
S
2329 if view_count is None:
2330 view_count = str_to_int(self._search_regex(
2331 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2332 'view count', default=None))
2333
bf3c9326
S
2334 average_rating = (
2335 float_or_none(video_details.get('averageRating'))
2336 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2337
7e72694b 2338 # subtitles
321bf820 2339 video_subtitles = self.extract_subtitles(
2340 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2341 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2342
2343 video_duration = try_get(
2344 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2345 if not video_duration:
2346 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2347 if not video_duration:
2348 video_duration = parse_duration(self._html_search_meta(
2349 'duration', video_webpage, 'video duration'))
2350
b84071c0
JP
2351 # Get Subscriber Count of channel
2352 subscriber_count = parse_count(self._search_regex(
2353 r'"text":"([\d\.]+\w?) subscribers"',
2354 video_webpage,
2355 'subscriber count',
2356 default=None
2357 ))
2358
7e72694b
S
2359 # annotations
2360 video_annotations = None
2361 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2362 xsrf_token = self._search_regex(
2363 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2364 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2365 invideo_url = try_get(
2366 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2367 if xsrf_token and invideo_url:
2368 xsrf_field_name = self._search_regex(
2369 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2370 video_webpage, 'xsrf field name',
2371 group='xsrf_field_name', default='session_token')
2372 video_annotations = self._download_webpage(
2373 self._proto_relative_url(invideo_url),
2374 video_id, note='Downloading annotations',
2375 errnote='Unable to download video annotations', fatal=False,
2376 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2377
84213ea8 2378 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2379
dd27fd17 2380 # Look for the DASH manifest
203fb43f 2381 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2382 dash_mpd_fatal = True
8ff648e4 2383 for mpd_url in dash_mpds:
d8d24a92 2384 dash_formats = {}
774e208f 2385 try:
05d0d131
YCH
2386 def decrypt_sig(mobj):
2387 s = mobj.group(1)
2388 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2389 return '/signature/%s' % dec_s
2390
8ff648e4 2391 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2392
8ff648e4 2393 for df in self._extract_mpd_formats(
2394 mpd_url, video_id, fatal=dash_mpd_fatal,
2395 formats_dict=self._formats):
c63ca0ee
S
2396 if not df.get('filesize'):
2397 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2398 # Do not overwrite DASH format found in some previous DASH manifest
2399 if df['format_id'] not in dash_formats:
2400 dash_formats[df['format_id']] = df
77c6fb5b
S
2401 # Additional DASH manifests may end up in HTTP Error 403 therefore
2402 # allow them to fail without bug report message if we already have
2403 # some DASH manifest succeeded. This is temporary workaround to reduce
2404 # burst of bug reports until we figure out the reason and whether it
2405 # can be fixed at all.
2406 dash_mpd_fatal = False
774e208f
PH
2407 except (ExtractorError, KeyError) as e:
2408 self.report_warning(
2409 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2410 if dash_formats:
04b3b3df
JMF
2411 # Remove the formats we found through non-DASH, they
2412 # contain less info and it can be wrong, because we use
2413 # fixed values (for example the resolution). See
067aa17e 2414 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2415 # example.
d80265cc 2416 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2417 formats.extend(dash_formats.values())
d80044c2 2418
6271f1ca
PH
2419 # Check for malformed aspect ratio
2420 stretched_m = re.search(
2421 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2422 video_webpage)
2423 if stretched_m:
313dfc45
LL
2424 w = float(stretched_m.group('w'))
2425 h = float(stretched_m.group('h'))
5faf9fed
S
2426 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2427 # We will only process correct ratios.
313dfc45 2428 if w > 0 and h > 0:
41f24c32 2429 ratio = w / h
313dfc45
LL
2430 for f in formats:
2431 if f.get('vcodec') != 'none':
2432 f['stretched_ratio'] = ratio
6271f1ca 2433
026fbedc 2434 if not formats:
43ebf77d
S
2435 if 'reason' in video_info:
2436 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2437 regions_allowed = self._html_search_meta(
2438 'regionsAllowed', video_webpage, default=None)
2439 countries = regions_allowed.split(',') if regions_allowed else None
2440 self.raise_geo_restricted(
2441 msg=video_info['reason'][0], countries=countries)
2442 reason = video_info['reason'][0]
2443 if 'Invalid parameters' in reason:
2444 unavailable_message = extract_unavailable_message()
2445 if unavailable_message:
2446 reason = unavailable_message
2447 raise ExtractorError(
2448 'YouTube said: %s' % reason,
2449 expected=True, video_id=video_id)
2450 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2451 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2452
4bcc7bd1 2453 self._sort_formats(formats)
4ea3be0a 2454
21c340b8 2455 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2456
4ea3be0a 2457 return {
8bcc8756
JW
2458 'id': video_id,
2459 'uploader': video_uploader,
2460 'uploader_id': video_uploader_id,
fd050249 2461 'uploader_url': video_uploader_url,
dd4c4492
S
2462 'channel_id': channel_id,
2463 'channel_url': channel_url,
8bcc8756 2464 'upload_date': upload_date,
7caf9830 2465 'license': video_license,
936784b2 2466 'creator': video_creator or artist,
8bcc8756 2467 'title': video_title,
936784b2 2468 'alt_title': video_alt_title or track,
b477fc13 2469 'thumbnails': thumbnails,
8bcc8756
JW
2470 'description': video_description,
2471 'categories': video_categories,
000b6b5a 2472 'tags': video_tags,
8bcc8756 2473 'subtitles': video_subtitles,
360e1ca5 2474 'automatic_captions': automatic_captions,
8bcc8756
JW
2475 'duration': video_duration,
2476 'age_limit': 18 if age_gate else 0,
2477 'annotations': video_annotations,
9cafc3fd 2478 'chapters': chapters,
7e8c0af0 2479 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2480 'view_count': view_count,
4ea3be0a 2481 'like_count': like_count,
2482 'dislike_count': dislike_count,
bf3c9326 2483 'average_rating': average_rating,
8bcc8756 2484 'formats': formats,
2fe1ff85 2485 'is_live': is_live,
7c80519c 2486 'start_time': start_time,
297a564b 2487 'end_time': end_time,
12afdc2a
S
2488 'series': series,
2489 'season_number': season_number,
2490 'episode_number': episode_number,
936784b2
S
2491 'track': track,
2492 'artist': artist,
5caabd3c 2493 'album': album,
2494 'release_date': release_date,
2495 'release_year': release_year,
b84071c0 2496 'subscriber_count': subscriber_count,
4ea3be0a 2497 }
c5e8d7af 2498
5f6a1245 2499
class YoutubeTabIE(YoutubeBaseInfoExtractor):
    # Extractor selected by _VALID_URL below: channel/c/user pages,
    # playlist and watch URLs carrying a list= parameter, and any other
    # first path segment that is not one of _RESERVED_NAMES.
    IE_DESC = 'YouTube.com tab'
    # (?x)^ will cause a warning in LiveIE, so this pattern cannot be split
    # into multiple lines using '''
    _VALID_URL = (
        r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/'
        r'(?:(?!(%s)([/#?]|$))|'
        r'(?:channel|c|user)/|'
        r'(?:playlist|watch)\?.*?\blist=)'
        r'(?P<id>[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES
    IE_NAME = 'youtube:tab'
2510
81127aa5 2511 _TESTS = [{
8bdd16b4 2512 # playlists, multipage
2513 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2514 'playlist_mincount': 94,
2515 'info_dict': {
2516 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2517 'title': 'Игорь Клейнер - Playlists',
2518 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2519 },
2520 }, {
2521 # playlists, multipage, different order
2522 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2523 'playlist_mincount': 94,
2524 'info_dict': {
2525 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2526 'title': 'Игорь Клейнер - Playlists',
2527 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2528 },
2529 }, {
2530 # playlists, singlepage
2531 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2532 'playlist_mincount': 4,
2533 'info_dict': {
2534 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2535 'title': 'ThirstForScience - Playlists',
2536 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2537 }
2538 }, {
2539 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2540 'only_matching': True,
2541 }, {
2542 # basic, single video playlist
0e30a7b9 2543 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2544 'info_dict': {
0e30a7b9 2545 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2546 'uploader': 'Sergey M.',
2547 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2548 'title': 'youtube-dl public playlist',
81127aa5 2549 },
0e30a7b9 2550 'playlist_count': 1,
9291475f 2551 }, {
8bdd16b4 2552 # empty playlist
0e30a7b9 2553 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2554 'info_dict': {
0e30a7b9 2555 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2556 'uploader': 'Sergey M.',
2557 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2558 'title': 'youtube-dl empty playlist',
9291475f
PH
2559 },
2560 'playlist_count': 0,
2561 }, {
8bdd16b4 2562 # Home tab
2563 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2564 'info_dict': {
8bdd16b4 2565 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2566 'title': 'lex will - Home',
2567 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2568 },
8bdd16b4 2569 'playlist_mincount': 2,
9291475f 2570 }, {
8bdd16b4 2571 # Videos tab
2572 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2573 'info_dict': {
8bdd16b4 2574 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2575 'title': 'lex will - Videos',
2576 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2577 },
8bdd16b4 2578 'playlist_mincount': 975,
9291475f 2579 }, {
8bdd16b4 2580 # Videos tab, sorted by popular
2581 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2582 'info_dict': {
8bdd16b4 2583 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2584 'title': 'lex will - Videos',
2585 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2586 },
8bdd16b4 2587 'playlist_mincount': 199,
9291475f 2588 }, {
8bdd16b4 2589 # Playlists tab
2590 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2591 'info_dict': {
8bdd16b4 2592 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2593 'title': 'lex will - Playlists',
2594 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2595 },
8bdd16b4 2596 'playlist_mincount': 17,
ac7553d0 2597 }, {
8bdd16b4 2598 # Community tab
2599 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2600 'info_dict': {
8bdd16b4 2601 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2602 'title': 'lex will - Community',
2603 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2604 },
2605 'playlist_mincount': 18,
87dadd45 2606 }, {
8bdd16b4 2607 # Channels tab
2608 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2609 'info_dict': {
8bdd16b4 2610 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2611 'title': 'lex will - Channels',
2612 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2613 },
2614 'playlist_mincount': 138,
6b08cdf6 2615 }, {
8bdd16b4 2616 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2617 'only_matching': True,
2618 }, {
2619 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2620 'only_matching': True,
2621 }, {
2622 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ',
2623 'only_matching': True,
2624 }, {
2625 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2626 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2627 'info_dict': {
2628 'title': '29C3: Not my department',
2629 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2630 'uploader': 'Christiaan008',
2631 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2632 },
2633 'playlist_count': 96,
2634 }, {
2635 'note': 'Large playlist',
2636 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2637 'info_dict': {
8bdd16b4 2638 'title': 'Uploads from Cauchemar',
2639 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2640 'uploader': 'Cauchemar',
2641 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2642 },
8bdd16b4 2643 'playlist_mincount': 1123,
2644 }, {
2645 # even larger playlist, 8832 videos
2646 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2647 'only_matching': True,
4b7df0d3
JMF
2648 }, {
2649 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2650 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2651 'info_dict': {
acf757f4
PH
2652 'title': 'Uploads from Interstellar Movie',
2653 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2654 'uploader': 'Interstellar Movie',
8bdd16b4 2655 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2656 },
481cc733 2657 'playlist_mincount': 21,
8bdd16b4 2658 }, {
2659 # https://github.com/ytdl-org/youtube-dl/issues/21844
2660 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2661 'info_dict': {
2662 'title': 'Data Analysis with Dr Mike Pound',
2663 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2664 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2665 'uploader': 'Computerphile',
2666 },
2667 'playlist_mincount': 11,
2668 }, {
2669 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2670 'only_matching': True,
dacb3a86
S
2671 }, {
2672 # Playlist URL that does not actually serve a playlist
2673 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2674 'info_dict': {
2675 'id': 'FqZTN594JQw',
2676 'ext': 'webm',
2677 'title': "Smiley's People 01 detective, Adventure Series, Action",
2678 'uploader': 'STREEM',
2679 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2680 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2681 'upload_date': '20150526',
2682 'license': 'Standard YouTube License',
2683 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2684 'categories': ['People & Blogs'],
2685 'tags': list,
dbdaaa23 2686 'view_count': int,
dacb3a86
S
2687 'like_count': int,
2688 'dislike_count': int,
2689 },
2690 'params': {
2691 'skip_download': True,
2692 },
13a75688 2693 'skip': 'This video is not available.',
dacb3a86 2694 'add_ie': [YoutubeIE.ie_key()],
481cc733 2695 }, {
8bdd16b4 2696 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2697 'only_matching': True,
66b48727 2698 }, {
8bdd16b4 2699 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2700 'only_matching': True,
81127aa5 2701 }]
c5e8d7af 2702
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    Any URL accepted by one of the ignored extractors (currently only
    YoutubeLiveIE) is rejected; everything else is decided by the
    inherited ``suitable`` check.
    """
    ignored_extractors = (YoutubeLiveIE,)
    for extractor in ignored_extractors:
        if extractor.suitable(url):
            return False
    return super(YoutubeTabIE, cls).suitable(url)
8bdd16b4 2709
def _extract_channel_id(self, webpage):
    """Return the channel id found in ``webpage``.

    The ``channelId`` meta tag is preferred.  When it is absent, the
    channel URL is read from one of several URL-bearing meta tags and the
    id is taken from its ``/channel/<id>`` path component.
    """
    channel_id = self._html_search_meta(
        'channelId', webpage, 'channel id', default=None)
    if channel_id:
        return channel_id
    channel_url = self._html_search_meta(
        ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
         'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
         'twitter:app:url:googleplay'), webpage, 'channel url')
    # The quantifier must sit inside the group: with ``(...)+`` a repeated
    # capture group only keeps its last iteration, i.e. a single character.
    return self._search_regex(
        r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
        channel_url, 'channel id')
15f6397c 2722
8bdd16b4 2723 @staticmethod
2724 def _extract_grid_item_renderer(item):
2725 for item_kind in ('Playlist', 'Video', 'Channel'):
2726 renderer = item.get('grid%sRenderer' % item_kind)
2727 if renderer:
2728 return renderer
2729
def _extract_video(self, renderer):
    """Build a ``url_transparent`` result for one video renderer.

    The result defers the actual extraction to YoutubeIE (both ``id`` and
    ``url`` carry the renderer's ``videoId``) and pre-fills title,
    description, duration, view count and uploader from the renderer.
    """
    def text_of(*getters):
        # First getter that yields a string wins; None otherwise.
        return try_get(renderer, getters, compat_str)

    video_id = renderer.get('videoId')

    # The view count text may contain whitespace; strip it all before
    # matching the leading digits-and-commas run.
    raw_views = text_of(lambda r: r['viewCountText']['simpleText']) or ''
    leading_number = self._search_regex(
        r'^([\d,]+)', re.sub(r'\s', '', raw_views),
        'view count', default=None)

    return {
        '_type': 'url_transparent',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
        'title': text_of(
            lambda r: r['title']['runs'][0]['text'],
            lambda r: r['title']['simpleText']),
        'description': text_of(
            lambda r: r['descriptionSnippet']['runs'][0]['text']),
        'duration': parse_duration(
            text_of(lambda r: r['lengthText']['simpleText'])),
        'view_count': str_to_int(leading_number),
        'uploader': text_of(lambda r: r['ownerText']['runs'][0]['text']),
    }
652cdaa2 2759
8bdd16b4 2760 def _grid_entries(self, grid_renderer):
2761 for item in grid_renderer['items']:
2762 if not isinstance(item, dict):
39b62db1 2763 continue
8bdd16b4 2764 renderer = self._extract_grid_item_renderer(item)
2765 if not isinstance(renderer, dict):
2766 continue
2767 title = try_get(
2768 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2769 # playlist
2770 playlist_id = renderer.get('playlistId')
2771 if playlist_id:
2772 yield self.url_result(
2773 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2774 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2775 video_title=title)
2776 # video
2777 video_id = renderer.get('videoId')
2778 if video_id:
2779 yield self._extract_video(renderer)
2780 # channel
2781 channel_id = renderer.get('channelId')
2782 if channel_id:
2783 title = try_get(
2784 renderer, lambda x: x['title']['simpleText'], compat_str)
2785 yield self.url_result(
2786 'https://www.youtube.com/channel/%s' % channel_id,
2787 ie=YoutubeTabIE.ie_key(), video_title=title)
2788
def _shelf_entries_trimmed(self, shelf_renderer):
    """Yield the entries embedded in a shelf's horizontal list.

    Nothing is yielded when the shelf has no
    ``content.horizontalListRenderer`` dict.
    """
    horizontal_list = try_get(
        shelf_renderer, lambda s: s['content']['horizontalListRenderer'], dict)
    if horizontal_list:
        # TODO: add support for nested playlists so each shelf is processed
        # as separate playlist
        # TODO: this includes only first N items
        for shelf_entry in self._grid_entries(horizontal_list):
            yield shelf_entry
2799
def _shelf_entries(self, shelf_renderer):
    """Yield a single URL result pointing at the shelf's own page.

    The URL comes from the shelf endpoint's web command metadata, joined
    onto https://www.youtube.com; nothing is yielded when no URL can be
    built.
    """
    endpoint_path = try_get(
        shelf_renderer,
        lambda s: s['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda s: s['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
c5e8d7af 2810
8bdd16b4 2811 def _playlist_entries(self, video_list_renderer):
2812 for content in video_list_renderer['contents']:
2813 if not isinstance(content, dict):
2814 continue
2815 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2816 if not isinstance(renderer, dict):
2817 continue
2818 video_id = renderer.get('videoId')
2819 if not video_id:
2820 continue
2821 yield self._extract_video(renderer)
07aeced6 2822
3462ffa8 2823 def _itemSection_entries(self, item_sect_renderer):
2824 for content in item_sect_renderer['contents']:
2825 if not isinstance(content, dict):
2826 continue
2827 renderer = content.get('videoRenderer', {})
2828 if not isinstance(renderer, dict):
2829 continue
2830 video_id = renderer.get('videoId')
2831 if not video_id:
2832 continue
2833 yield self._extract_video(renderer)
2834
def _rich_entries(self, rich_grid_renderer):
    """Yield the video wrapped by a rich item renderer, if any.

    Nothing is yielded when ``content.videoRenderer`` is missing, is not
    a dict, or has no truthy ``videoId``.
    """
    # try_get returns None when the path is absent or not a dict; fall
    # back to an empty dict so the lookup below cannot raise.
    renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
    video_id = renderer.get('videoId')
    if not video_id:
        return
    yield self._extract_video(renderer)
2842
8bdd16b4 2843 def _video_entry(self, video_renderer):
2844 video_id = video_renderer.get('videoId')
2845 if video_id:
2846 return self._extract_video(video_renderer)
dacb3a86 2847
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by one community post.

    The attached video (if any) is yielded first, followed by every
    inline link in the post text that YoutubeIE can handle.  An inline
    link to the same video as the attachment is not yielded again.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
    video_id = None
    if video_renderer:
        entry = self._video_entry(video_renderer)
        if entry:
            # Remember the attachment id so that an inline link to the
            # same video is skipped below.
            video_id = entry.get('id')
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Tag the result with the id of the linked video itself.
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2876
8bdd16b4 2877 def _post_thread_continuation_entries(self, post_thread_continuation):
2878 contents = post_thread_continuation.get('contents')
2879 if not isinstance(contents, list):
2880 return
2881 for content in contents:
2882 renderer = content.get('backstagePostThreadRenderer')
2883 if not isinstance(renderer, dict):
2884 continue
2885 for entry in self._post_thread_entries(renderer):
2886 yield entry
07aeced6 2887
@staticmethod
def _extract_next_continuation_data(renderer):
    """Build continuation query parameters from ``nextContinuationData``.

    Looks at ``continuations[0].nextContinuationData`` and returns a dict
    with ``ctoken``, ``continuation`` and ``itct`` keys, or None when the
    data or its continuation token is missing.
    """
    data = try_get(
        renderer, lambda r: r['continuations'][0]['nextContinuationData'], dict)
    token = data.get('continuation') if data else None
    if not token:
        return None
    return {
        'ctoken': token,
        'continuation': token,
        'itct': data.get('clickTrackingParams'),
    }
c5e8d7af 2903
@classmethod
def _extract_continuation(cls, renderer):
    """Return continuation query parameters for ``renderer``, or None.

    ``nextContinuationData`` is tried first.  Otherwise ``contents`` is
    scanned for the first ``continuationItemRenderer`` whose endpoint has
    both a continuation token and click tracking params.
    """
    direct = cls._extract_next_continuation_data(renderer)
    if direct:
        return direct

    items = renderer.get('contents')
    if not isinstance(items, list):
        return None

    for item in items:
        if not isinstance(item, dict):
            continue
        endpoint = try_get(
            item, lambda i: i['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda e: e['continuationCommand']['token'], compat_str)
        if token:
            tracking_params = endpoint.get('clickTrackingParams')
            if tracking_params:
                return {
                    'ctoken': token,
                    'continuation': token,
                    'itct': tracking_params,
                }
    return None
448830ce 2932
def _entries(self, tab, identity_token):
    """Yield every entry of a tab, following continuation pages.

    The entries embedded in the tab's ``sectionListRenderer`` or
    ``richGridRenderer`` are yielded first.  Further pages are then
    requested from the ``browse_ajax`` endpoint for as long as a
    continuation is available.  Paging stops on a failed or unrecognised
    response, and after ``self._MAX_PAGES`` pages when that attribute is
    defined.

    identity_token, when truthy, is sent as the
    ``x-youtube-identity-token`` header of the continuation requests.
    """

    def extract_entries(parent_renderer):
        # Yields the entries found under parent_renderer and records the
        # continuation to follow next in continuation_list[0].
        slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        for slr_content in slr_contents:
            if not isinstance(slr_content, dict):
                continue
            is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                renderer = slr_content.get('richItemRenderer')
                if renderer:
                    for entry in self._rich_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue
                renderer = isr_content.get('playlistVideoListRenderer')
                if renderer:
                    for entry in self._playlist_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('gridRenderer')
                if renderer:
                    for entry in self._grid_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('shelfRenderer')
                if renderer:
                    for entry in self._shelf_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                renderer = isr_content.get('backstagePostThreadRenderer')
                if renderer:
                    for entry in self._post_thread_entries(renderer):
                        yield entry
                    continuation_list[0] = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('videoRenderer')
                if renderer:
                    entry = self._video_entry(renderer)
                    if entry:
                        yield entry
            # Fall back to the section's own continuation, then to the
            # parent's, when no inner renderer supplied one.
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(is_renderer)
        if not continuation_list[0]:
            continuation_list[0] = self._extract_continuation(parent_renderer)

    # One-element list used as a mutable cell shared with extract_entries
    # (Python 2 does not support nonlocal).
    continuation_list = [None]
    parent_renderer = (
        try_get(tab, lambda x: x['sectionListRenderer'], dict)
        or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
    if parent_renderer:
        for entry in extract_entries(parent_renderer):
            yield entry

    continuation = continuation_list[0]

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    for page_num in itertools.count(1):
        if not continuation:
            break
        if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES:
            break
        browse = self._download_json(
            'https://www.youtube.com/browse_ajax', None,
            'Downloading page %d' % page_num,
            headers=headers, query=continuation, fatal=False)
        if not browse:
            break
        response = try_get(browse, lambda x: x[1]['response'], dict)
        if not response:
            break

        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict)
        if continuation_contents:
            continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
            if continuation_renderer:
                for entry in self._playlist_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('gridContinuation')
            if continuation_renderer:
                for entry in self._grid_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('itemSectionContinuation')
            if continuation_renderer:
                for entry in self._post_thread_continuation_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('sectionListContinuation')
            if continuation_renderer:
                # Reset the shared cell so a stale continuation from the
                # previous page is never reused.
                continuation_list[0] = None
                for entry in extract_entries(continuation_renderer):
                    yield entry
                continuation = continuation_list[0]
                continue

        continuation_items = try_get(
            response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
        if continuation_items:
            continuation_item = continuation_items[0]
            if not isinstance(continuation_item, dict):
                # Unrecognised page: stop paging.  Continuing here would
                # re-request the same continuation indefinitely because
                # `continuation` has not been advanced.
                break
            renderer = continuation_item.get('playlistVideoRenderer')
            if renderer:
                video_list_renderer = {'contents': continuation_items}
                for entry in self._playlist_entries(video_list_renderer):
                    yield entry
                continuation = self._extract_continuation(video_list_renderer)
                continue
            renderer = continuation_item.get('itemSectionRenderer')
            if renderer:
                for entry in self._itemSection_entries(renderer):
                    yield entry
                continuation = self._extract_continuation({'contents': continuation_items})
                continue
        break
9558dcec 3067
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    Raises ExtractorError when no tab has ``tabRenderer.selected`` set to
    a true boolean.
    """
    chosen = next(
        (candidate for candidate in tabs
         if try_get(candidate, lambda x: x['tabRenderer']['selected'], bool)),
        None)
    if chosen is None:
        raise ExtractorError('Unable to find selected tab')
    return chosen['tabRenderer']
b82f815f 3075
@staticmethod
def _extract_uploader(data):
    """Collect uploader fields from the playlist sidebar of ``data``.

    Returns a dict that is empty when no video owner is found, and
    otherwise holds ``uploader``, ``uploader_id`` and ``uploader_url``.
    When several sidebar items carry an owner, the last one wins.
    """
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
    info = {}
    for sidebar_item in sidebar_items:
        secondary = (
            sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
            if isinstance(sidebar_item, dict) else None)
        if not isinstance(secondary, dict):
            continue
        owner = try_get(
            secondary, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        info['uploader'] = owner.get('text')
        info['uploader_id'] = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        info['uploader_url'] = urljoin('https://www.youtube.com/', base_url)
    return info
3098
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build a playlist result from the selected tab of a tabbed page.

    Channel metadata, when present, supplies title, description and id;
    playlist metadata, when present, overrides all three (the id becomes
    *item_id*).  Returns None when no playlist id could be determined.
    The result is extended with whatever _extract_uploader finds.
    """
    selected_tab = self._extract_selected_tab(tabs)
    channel_meta = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    playlist_meta = try_get(
        data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)

    playlist_id = None
    if channel_meta:
        title = channel_meta.get('title') or item_id
        tab_title = selected_tab.get('title')
        if tab_title:
            title = title + ' - %s' % tab_title
        description = channel_meta.get('description')
        playlist_id = channel_meta.get('externalId')
    if playlist_meta:
        # Playlist metadata takes precedence over channel metadata
        title = playlist_meta.get('title')
        description = None
        playlist_id = item_id
    if playlist_id is None:
        return None

    result = self.playlist_result(
        self._entries(selected_tab['content'], identity_token),
        playlist_id=playlist_id, playlist_title=title,
        playlist_description=description)
    result.update(self._extract_uploader(data))
    return result
73c4ac2c 3126
def _extract_from_playlist(self, item_id, data, playlist):
    """Build a playlist result from a watch-page *playlist* dict.

    The title comes from the playlist itself, falling back to
    ``titleText.simpleText`` of *data*; the id falls back to *item_id*.
    """
    title = playlist.get('title')
    if not title:
        title = try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
    playlist_id = playlist.get('playlistId')
    if not playlist_id:
        playlist_id = item_id
    return self.playlist_result(
        self._playlist_entries(playlist),
        playlist_id=playlist_id, playlist_title=title)
c5e8d7af 3134
def _real_extract(self, url):
    """Extract a tab page, a watch-page playlist or a single video.

    The host of *url* is forced to www.youtube.com before downloading.
    With both ``v`` and ``list`` in the query string, the ``noplaylist``
    option decides whether only the video is returned.

    Raises ExtractorError when the page matches none of the known
    layouts and no video id is available to fall back on.
    """
    item_id = self._match_id(url)
    normalized = compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')
    url = compat_urlparse.urlunparse(normalized)

    # Handle both video/playlist URLs
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen(
                'Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(
                video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen(
            'Downloading playlist %s - add --no-playlist to just download video %s'
            % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
    data = self._extract_yt_initial_data(item_id, webpage)

    tabs = try_get(
        data,
        lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'],
        list)
    if tabs:
        return self._extract_from_tabs(
            item_id, webpage, data, tabs, identity_token)

    playlist = try_get(
        data,
        lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'],
        dict)
    if playlist:
        return self._extract_from_playlist(item_id, data, playlist)

    # Fallback to video extraction if no playlist alike page is recognized
    if not video_id:
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
    return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
c5e8d7af 3166
c5e8d7af 3167
class YoutubePlaylistIE(InfoExtractor):
    """Accept bare playlist ids and ``list=`` URLs and hand them to YoutubeTabIE."""
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that YoutubeTabIE already claims."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Rewrite the input as a /playlist URL and delegate to YoutubeTabIE.

        The original query string is carried over; when there is none
        (e.g. a bare id), ``list=<playlist id>`` is used instead.
        """
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not query:
            query = {'list': playlist_id}
        target = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3266
3267
class YoutubeYtUserIE(InfoExtractor):
    """Resolve the ``ytuser:<name>`` shorthand to a /user/ URL."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate the matching /user/ page to YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3280
b05654f0 3281
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a ``<channel url>/live`` page to its video, or to the channel."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>%s)/live' % YoutubeTabIE._VALID_URL

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the video named by the /live page, else the base URL.

        The download is non-fatal.  A video result is produced only when
        the page's og:type starts with 'video' and its ``videoId`` meta
        value is an 11-character id; in every other case the URL without
        the trailing ``/live`` is returned for further resolution.
        """
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(
                r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3332
3333
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor that pages through the youtubei v1 search endpoint."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    # No real upper bound seems to exist: a search for 'python', for
    # example, reports more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent video entries for *query*.

        Pages are requested one after another, each carrying the
        continuation token of the previous one.  Iteration stops when
        *n* entries were produced, a download fails, the expected
        result containers are missing or no continuation token is left.
        """
        payload = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            payload['params'] = self._SEARCH_PARAMS

        yielded = 0
        for page_num in itertools.count(1):
            response = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(payload).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not response:
                break

            # The first getter is tried first, then the continuation layout
            sections = try_get(
                response,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not sections:
                break
            items = try_get(
                sections,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not items:
                break

            for item in items:
                if not isinstance(item, dict):
                    continue
                video = item.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(
                    video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(
                    video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                length_text = try_get(
                    video, lambda x: x['lengthText']['simpleText'], compat_str)
                duration = parse_duration(length_text)
                view_count_text = try_get(
                    video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Only a leading run of digits, after removing whitespace, counts
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(
                    video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                yielded += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if yielded == n:
                    return

            next_token = try_get(
                sections,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not next_token:
                break
            payload['continuation'] = next_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = self._entries(query, n)
        return self.playlist_result(entries, query)
75dff0ee 3422
c9ae7b95 3423
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YoutubeSearchIE variant that sends a fixed ``params`` value for date ordering."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3429
c9ae7b95 3430
class YoutubeSearchURLIE(InfoExtractor):
    """Handle youtube.com/results URLs by delegating to YoutubeSearchIE."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    # Not referenced in this class; kept so the class attributes stay unchanged
    _PARAM_REGEX = r''
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?'
    _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Run the search described by *url* and return its playlist result.

        The query is taken from ``q``/``search_query`` and URL-decoded.
        An ``sp`` value found before or after the query is forwarded
        unchanged as the search params; it is None when the URL has no
        ``sp``.  At most ``_MAX_RESULTS`` entries are requested.
        """
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        search_ie = YoutubeSearchIE(self._downloader)
        # Set on the instance only, so YoutubeSearchIE itself is unaffected.
        # No debug print of this value: it is None for URLs without `sp`.
        search_ie._SEARCH_PARAMS = mobj.group('param1') or mobj.group('param2')
        search_ie._MAX_RESULTS = self._MAX_RESULTS
        return search_ie._get_n_results(query, self._MAX_RESULTS)
3456
3457
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Common base for the feed extractors.

    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    # _MAX_PAGES = 5

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _shelf_entries(self, shelf_renderer):
        """Yield the grid entries of a shelf; nothing when it has no grid."""
        grid = try_get(
            shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
        if grid:
            for entry in self._grid_entries(grid):
                yield entry

    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Return the selected tab's entries as a playlist titled _PLAYLIST_TITLE."""
        selected_tab = self._extract_selected_tab(tabs)
        entries = self._entries(selected_tab['content'], identity_token)
        return self.playlist_result(
            entries, playlist_title=self._PLAYLIST_TITLE)

    def _real_extract(self, url):
        """Download /feed/<_FEED_NAME> and extract it; *url* itself is ignored.

        Raises ExtractorError when the page exposes no tab list.
        """
        item_id = self._FEED_NAME
        url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        webpage = self._download_webpage(url, item_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data,
            lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'],
            list)
        if not tabs:
            # Failed to recognize
            raise ExtractorError('Unable to recognize feed page')
        return self._extract_from_tabs(
            item_id, webpage, data, tabs, identity_token)
25f14e9f
S
3502
3503
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Map the watch-later feed URL and ``:ytwatchlater`` to the ``WL`` playlist."""
    _FEED_NAME = 'watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the fixed ``WL`` id over to YoutubePlaylistIE."""
        playlist_ie_key = YoutubePlaylistIE.ie_key()
        return self.url_result('WL', ie=playlist_ie_key)
3519
3520
class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor):
    """Map the ``:ytfav`` family of keywords to the ``LL`` (liked videos) playlist."""
    IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)'
    # The "o(u)rite" part must be optional: the advertised short form and
    # the test below are the bare ":ytfav", which a mandatory group rejects.
    # Accepted: :ytfav, :ytfavs, :ytfavorite(s), :ytfavourite(s)
    _VALID_URL = r':ytfav(?:ou?rite)?s?'
    _FEED_NAME = 'favourites'

    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the fixed ``LL`` id over to YoutubePlaylistIE."""
        return self.url_result('LL', ie=YoutubePlaylistIE.ie_key())
f459d170 3533
5f6a1245 3534
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``recommended`` feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Besides /feed/recommended and the keyword, the bare youtube.com
    # front page (optionally followed by "?" or "#") matches as well
    _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
1ed5b5c9 3540
1ed5b5c9 3541
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``subscriptions`` feed."""
    # The keyword is quoted with its leading colon because the pattern
    # below does not accept a bare "ytsubs"
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 3547
1ed5b5c9 3548
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``history`` feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3554
3555
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that end before any video id and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3603
3604
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose ``v`` value is only 1 to 10 characters long."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
8bdd16b4 3620
3621
# Do YouTube show URLs even exist anymore? I couldn't find any.
3623r'''
3624class YoutubeShowIE(YoutubeTabIE):
8bdd16b4 3625 IE_DESC = 'YouTube.com (multi-season) shows'
3626 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3627 IE_NAME = 'youtube:show'
3628 _TESTS = [{
3629 'url': 'https://www.youtube.com/show/airdisasters',
3630 'playlist_mincount': 5,
3631 'info_dict': {
3632 'id': 'airdisasters',
3633 'title': 'Air Disasters',
3634 }
3635 }]
3636
3637 def _real_extract(self, url):
3638 playlist_id = self._match_id(url)
3639 return super(YoutubeShowIE, self)._real_extract(
3640 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3462ffa8 3641'''