]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Updated to release 2020.11.21.1
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
27019dbb 29 bool_or_none,
c5e8d7af 30 clean_html,
9b9c5355 31 error_to_compat_str,
c5e8d7af 32 ExtractorError,
2d30521a 33 float_or_none,
4bb4a188 34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
6310acf5 37 parse_codecs,
b84071c0 38 parse_count,
7c80519c 39 parse_duration,
0cb58b02 40 remove_quotes,
3995d37d 41 remove_start,
cf7e015f 42 smuggle_url,
dbdaaa23 43 str_or_none,
c93d53f5 44 str_to_int,
556dbe7f 45 try_get,
c5e8d7af
PH
46 unescapeHTML,
47 unified_strdate,
cf7e015f 48 unsmuggle_url,
8bdd16b4 49 update_url_query,
81c2f20b 50 uppercase_escape,
21c340b8 51 url_or_none,
6e6bc8da 52 urlencode_postdata,
8bdd16b4 53 urljoin,
c5e8d7af
PH
54)
55
5f6a1245 56
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints used by the multi-step sign-in flow implemented in _login()
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    # Regex alternation (no anchors, no outer group) of reserved path names
    _RESERVED_NAMES = (
        r'course|embed|watch|w|results|storefront|'
        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|LL|WL)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one url_result entry (extractor key 'Youtube') per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and neither credentials nor a cookie file
        were provided, an ExtractorError is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False explicitly, as documented (was a bare return)
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus f_req (JSON-encoded) to url.

            The request is non-fatal; callers compare the result against False
            to detect a failed request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the response
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the
        # challenge request payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Return False explicitly, as documented (was a bare return)
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesized: '%' binds tighter than the conditional expression,
            # so without the parentheses any other message lost its prefix
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    # Trailing space added so the two adjacent literals do not
                    # run together as "<code>(Note"
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password warning above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Any other challenge type cannot be solved here; tell the
                # user to resolve it in a browser
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # The login is treated as successful only if the CheckCookie page
        # mentions the account page URL
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of the 'query' kwarg.

        A missing or None 'query' is replaced by an empty dict, so an
        explicit query=None no longer raises AttributeError here.
        """
        query = (kwargs.get('query') or {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Return the parsed ytInitialData object found in webpage.

        Returns None when neither assignment pattern matches; JSON parsing is
        non-fatal.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in (no-op without a downloader)."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Base payload sent with every _call_api() request
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'

    def _call_api(self, ep, query, video_id):
        """POST a JSON request to the youtubei/v1 endpoint ep and return the decoded response.

        The request body is a shallow copy of _DEFAULT_API_DATA updated with
        the top-level keys of query.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Find and parse the ytInitialData JSON object in webpage."""
        return self._parse_json(
            self._search_regex(
                # Try the pattern followed by a newline first, then without it
                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
0c148415
S
330
331
360e1ca5 332class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 333 IE_DESC = 'YouTube.com'
cb7dfeea 334 _VALID_URL = r"""(?x)^
c5e8d7af 335 (
edb53e2d 336 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 337 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 338 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 339 (?:www\.)?pwnyoutube\.com/|
8b561bfc 340 (?:www\.)?hooktube\.com/|
f7000f3a 341 (?:www\.)?yourepeat\.com/|
e69ae5b9 342 tube\.majestyc\.net/|
ba036333 343 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 344 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 345 (?:(?:www|no)\.)?invidiou\.sh/|
346 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 347 (?:www\.)?invidious\.kabi\.tk/|
ba036333 348 (?:www\.)?invidious\.13ad\.de/|
791d2e81 349 (?:www\.)?invidious\.mastodon\.host/|
494d664e 350 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 351 (?:www\.)?invidious\.drycat\.fr/|
ba036333 352 (?:www\.)?tube\.poal\.co/|
8ae113ca 353 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 354 (?:www\.)?yewtu\.be/|
494d664e 355 (?:www\.)?yt\.elukerio\.org/|
894b3826 356 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 357 (?:www\.)?invidious\.ggc-project\.de/|
358 (?:www\.)?yt\.maisputain\.ovh/|
359 (?:www\.)?invidious\.13ad\.de/|
360 (?:www\.)?invidious\.toot\.koeln/|
361 (?:www\.)?invidious\.fdn\.fr/|
362 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 363 (?:www\.)?kgg2m7yk5aybusll\.onion/|
364 (?:www\.)?qklhadlycap4cnod\.onion/|
365 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
366 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
367 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
368 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 369 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 370 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 371 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
372 (?:.*?\#/)? # handle anchor (#/) redirect urls
373 (?: # the various things that can precede the ID:
ac7553d0 374 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 375 |(?: # or the v= param in all its forms
f7000f3a 376 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 377 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 378 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
379 v=
380 )
f4b05232 381 ))
cbaed4bb
S
382 |(?:
383 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
384 vid\.plus| # or vid.plus/xxxx
385 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 386 )/
edb53e2d 387 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 388 )
c5e8d7af 389 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 390 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
391 (?!.*?\blist=
392 (?:
393 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
394 WL # WL are handled by the watch later IE
395 )
396 )
c5e8d7af 397 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 398 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 399 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
400 _PLAYER_INFO_RE = (
401 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
402 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
403 )
2c62dc26 404 _formats = {
c2d3cb4c 405 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
406 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
407 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
408 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
409 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
410 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
412 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 413 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 414 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
415 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
416 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
417 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
418 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
419 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 420 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 421 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
422 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 423
424
425 # 3D videos
c2d3cb4c 426 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
427 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
428 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
429 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 430 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
431 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
432 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 433
96fb5605 434 # Apple HTTP Live Streaming
11f12195 435 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 436 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
437 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
438 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
439 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
440 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 441 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
442 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
443
444 # DASH mp4 video
d23028a8
S
445 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
449 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 450 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
451 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
452 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
453 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
454 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
455 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
456 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 457
f6f1fc92 458 # Dash mp4 audio
d23028a8
S
459 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
460 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
461 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
462 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
463 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
464 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
465 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
466
467 # Dash webm
d23028a8
S
468 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
469 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
470 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
471 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
472 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
473 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
474 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
475 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
478 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
480 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
481 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
482 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 483 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
484 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
485 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
486 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
487 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
488 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
489 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
490
491 # Dash webm audio
d23028a8
S
492 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
493 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 494
0857baad 495 # Dash webm audio with opus inside
d23028a8
S
496 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
497 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
498 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 499
ce6b9a2d
PH
500 # RTMP (unnamed)
501 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
502
503 # av01 video only formats sometimes served with "unknown" codecs
504 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
505 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
506 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
507 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 508 }
84da5d84 509 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 510
fd5c4aab
S
511 _GEO_BYPASS = False
512
78caa52a 513 IE_NAME = 'youtube'
2eb88d95
PH
514 _TESTS = [
515 {
2d3d2997 516 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
517 'info_dict': {
518 'id': 'BaW_jenozKc',
519 'ext': 'mp4',
3867038a 520 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
521 'uploader': 'Philipp Hagemeister',
522 'uploader_id': 'phihag',
ec85ded8 523 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
524 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
525 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 526 'upload_date': '20121002',
3867038a 527 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 528 'categories': ['Science & Technology'],
3867038a 529 'tags': ['youtube-dl'],
556dbe7f 530 'duration': 10,
dbdaaa23 531 'view_count': int,
3e7c1224
PH
532 'like_count': int,
533 'dislike_count': int,
7c80519c 534 'start_time': 1,
297a564b 535 'end_time': 9,
2eb88d95 536 }
0e853ca4 537 },
fccd3771 538 {
4bc3a23e
PH
539 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
540 'note': 'Embed-only video (#1746)',
541 'info_dict': {
542 'id': 'yZIXLfi8CZQ',
543 'ext': 'mp4',
544 'upload_date': '20120608',
545 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
546 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
547 'uploader': 'SET India',
94bfcd23 548 'uploader_id': 'setindia',
ec85ded8 549 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 550 'age_limit': 18,
fccd3771
PH
551 }
552 },
11b56058 553 {
8bdd16b4 554 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
555 'note': 'Use the first video ID in the URL',
556 'info_dict': {
557 'id': 'BaW_jenozKc',
558 'ext': 'mp4',
3867038a 559 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
560 'uploader': 'Philipp Hagemeister',
561 'uploader_id': 'phihag',
ec85ded8 562 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 563 'upload_date': '20121002',
3867038a 564 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 565 'categories': ['Science & Technology'],
3867038a 566 'tags': ['youtube-dl'],
556dbe7f 567 'duration': 10,
dbdaaa23 568 'view_count': int,
11b56058
PM
569 'like_count': int,
570 'dislike_count': int,
34a7de29
S
571 },
572 'params': {
573 'skip_download': True,
574 },
11b56058 575 },
dd27fd17 576 {
2d3d2997 577 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
578 'note': '256k DASH audio (format 141) via DASH manifest',
579 'info_dict': {
580 'id': 'a9LDPn-MO4I',
581 'ext': 'm4a',
582 'upload_date': '20121002',
583 'uploader_id': '8KVIDEO',
ec85ded8 584 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
585 'description': '',
586 'uploader': '8KVIDEO',
587 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 588 },
4bc3a23e
PH
589 'params': {
590 'youtube_include_dash_manifest': True,
591 'format': '141',
4919603f 592 },
de3c7fe0 593 'skip': 'format 141 not served anymore',
dd27fd17 594 },
8bdd16b4 595 # DASH manifest with encrypted signature
596 {
597 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
598 'info_dict': {
599 'id': 'IB3lcPjvWLA',
600 'ext': 'm4a',
601 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
602 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
603 'duration': 244,
604 'uploader': 'AfrojackVEVO',
605 'uploader_id': 'AfrojackVEVO',
606 'upload_date': '20131011',
607 },
608 'params': {
609 'youtube_include_dash_manifest': True,
610 'format': '141/bestaudio[ext=m4a]',
611 },
612 },
aa79ac0c
PH
613 # Controversy video
614 {
615 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
616 'info_dict': {
617 'id': 'T4XJQO3qol8',
618 'ext': 'mp4',
556dbe7f 619 'duration': 219,
aa79ac0c 620 'upload_date': '20100909',
4fe54c12 621 'uploader': 'Amazing Atheist',
aa79ac0c 622 'uploader_id': 'TheAmazingAtheist',
ec85ded8 623 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
624 'title': 'Burning Everyone\'s Koran',
625 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
626 }
c522adb1 627 },
dd2d55f1 628 # Normal age-gate video (embed allowed)
c522adb1 629 {
2d3d2997 630 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
631 'info_dict': {
632 'id': 'HtVdAasjOgU',
633 'ext': 'mp4',
634 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 635 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 636 'duration': 142,
c522adb1
JMF
637 'uploader': 'The Witcher',
638 'uploader_id': 'WitcherGame',
ec85ded8 639 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 640 'upload_date': '20140605',
34952f09 641 'age_limit': 18,
c522adb1
JMF
642 },
643 },
8bdd16b4 644 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
645 # YouTube Red ad is not captured for creator
646 {
647 'url': '__2ABJjxzNo',
648 'info_dict': {
649 'id': '__2ABJjxzNo',
650 'ext': 'mp4',
651 'duration': 266,
652 'upload_date': '20100430',
653 'uploader_id': 'deadmau5',
654 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
655 'creator': 'Dada Life, deadmau5',
656 'description': 'md5:12c56784b8032162bb936a5f76d55360',
657 'uploader': 'deadmau5',
658 'title': 'Deadmau5 - Some Chords (HD)',
659 'alt_title': 'This Machine Kills Some Chords',
660 },
661 'expected_warnings': [
662 'DASH manifest missing',
663 ]
664 },
067aa17e 665 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
666 {
667 'url': 'lqQg6PlCWgI',
668 'info_dict': {
669 'id': 'lqQg6PlCWgI',
670 'ext': 'mp4',
556dbe7f 671 'duration': 6085,
90227264 672 'upload_date': '20150827',
cbe2bd91 673 'uploader_id': 'olympic',
ec85ded8 674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 675 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 676 'uploader': 'Olympic',
cbe2bd91
PH
677 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
678 },
679 'params': {
680 'skip_download': 'requires avconv',
e52a40ab 681 }
cbe2bd91 682 },
6271f1ca
PH
683 # Non-square pixels
684 {
685 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
686 'info_dict': {
687 'id': '_b-2C3KPAM0',
688 'ext': 'mp4',
689 'stretched_ratio': 16 / 9.,
556dbe7f 690 'duration': 85,
6271f1ca
PH
691 'upload_date': '20110310',
692 'uploader_id': 'AllenMeow',
ec85ded8 693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 694 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 695 'uploader': '孫ᄋᄅ',
6271f1ca
PH
696 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
697 },
06b491eb
S
698 },
699 # url_encoded_fmt_stream_map is empty string
700 {
701 'url': 'qEJwOuvDf7I',
702 'info_dict': {
703 'id': 'qEJwOuvDf7I',
f57b7835 704 'ext': 'webm',
06b491eb
S
705 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
706 'description': '',
707 'upload_date': '20150404',
708 'uploader_id': 'spbelect',
709 'uploader': 'Наблюдатели Петербурга',
710 },
711 'params': {
712 'skip_download': 'requires avconv',
e323cf3f
S
713 },
714 'skip': 'This live event has ended.',
06b491eb 715 },
067aa17e 716 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
717 {
718 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
719 'info_dict': {
720 'id': 'FIl7x6_3R5Y',
eb6793ba 721 'ext': 'webm',
da77d856
S
722 'title': 'md5:7b81415841e02ecd4313668cde88737a',
723 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 724 'duration': 220,
da77d856
S
725 'upload_date': '20150625',
726 'uploader_id': 'dorappi2000',
ec85ded8 727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 728 'uploader': 'dorappi2000',
eb6793ba 729 'formats': 'mincount:31',
da77d856 730 },
eb6793ba 731 'skip': 'not actual anymore',
2ee8f5d8 732 },
8a1a26ce
YCH
733 # DASH manifest with segment_list
734 {
735 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
736 'md5': '8ce563a1d667b599d21064e982ab9e31',
737 'info_dict': {
738 'id': 'CsmdDsKjzN8',
739 'ext': 'mp4',
17ee98e1 740 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
741 'uploader': 'Airtek',
742 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
743 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
744 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
745 },
746 'params': {
747 'youtube_include_dash_manifest': True,
748 'format': '135', # bestvideo
be49068d
S
749 },
750 'skip': 'This live event has ended.',
2ee8f5d8 751 },
cf7e015f
S
752 {
753 # Multifeed videos (multiple cameras), URL is for Main Camera
754 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'title': 'teamPGP: Rocket League Noob Stream',
758 'description': 'md5:dc7872fb300e143831327f1bae3af010',
759 },
760 'playlist': [{
761 'info_dict': {
762 'id': 'jqWvoWXjCVs',
763 'ext': 'mp4',
764 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 766 'duration': 7335,
cf7e015f
S
767 'upload_date': '20150721',
768 'uploader': 'Beer Games Beer',
769 'uploader_id': 'beergamesbeer',
ec85ded8 770 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 771 'license': 'Standard YouTube License',
cf7e015f
S
772 },
773 }, {
774 'info_dict': {
775 'id': '6h8e8xoXJzg',
776 'ext': 'mp4',
777 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
778 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 779 'duration': 7337,
cf7e015f
S
780 'upload_date': '20150721',
781 'uploader': 'Beer Games Beer',
782 'uploader_id': 'beergamesbeer',
ec85ded8 783 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 784 'license': 'Standard YouTube License',
cf7e015f
S
785 },
786 }, {
787 'info_dict': {
788 'id': 'PUOgX5z9xZw',
789 'ext': 'mp4',
790 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
791 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 792 'duration': 7337,
cf7e015f
S
793 'upload_date': '20150721',
794 'uploader': 'Beer Games Beer',
795 'uploader_id': 'beergamesbeer',
ec85ded8 796 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 797 'license': 'Standard YouTube License',
cf7e015f
S
798 },
799 }, {
800 'info_dict': {
801 'id': 'teuwxikvS5k',
802 'ext': 'mp4',
803 'title': 'teamPGP: Rocket League Noob Stream (zim)',
804 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 805 'duration': 7334,
cf7e015f
S
806 'upload_date': '20150721',
807 'uploader': 'Beer Games Beer',
808 'uploader_id': 'beergamesbeer',
ec85ded8 809 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 810 'license': 'Standard YouTube License',
cf7e015f
S
811 },
812 }],
813 'params': {
814 'skip_download': True,
815 },
4fe54c12 816 'skip': 'This video is not available.',
cbaed4bb 817 },
f9f49d87 818 {
067aa17e 819 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
820 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
821 'info_dict': {
822 'id': 'gVfLd0zydlo',
823 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
824 },
825 'playlist_count': 2,
be49068d 826 'skip': 'Not multifeed anymore',
f9f49d87 827 },
cbaed4bb 828 {
2d3d2997 829 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 830 'only_matching': True,
0e49d9a6 831 },
6d4fc66b 832 {
2d3d2997 833 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
834 'only_matching': True,
835 },
0e49d9a6 836 {
067aa17e 837 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 838 # Also tests cut-off URL expansion in video description (see
067aa17e
S
839 # https://github.com/ytdl-org/youtube-dl/issues/1892,
840 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
841 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
842 'info_dict': {
843 'id': 'lsguqyKfVQg',
844 'ext': 'mp4',
845 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 846 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 847 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 848 'duration': 133,
0e49d9a6
LL
849 'upload_date': '20151119',
850 'uploader_id': 'IronSoulElf',
ec85ded8 851 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 852 'uploader': 'IronSoulElf',
eb6793ba
S
853 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
854 'track': 'Dark Walk - Position Music',
855 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 856 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
857 },
858 'params': {
859 'skip_download': True,
860 },
861 },
61f92af1 862 {
067aa17e 863 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
864 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
865 'only_matching': True,
866 },
313dfc45
LL
867 {
868 # Video with yt:stretch=17:0
869 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
870 'info_dict': {
871 'id': 'Q39EVAstoRM',
872 'ext': 'mp4',
873 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
874 'description': 'md5:ee18a25c350637c8faff806845bddee9',
875 'upload_date': '20151107',
876 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
877 'uploader': 'CH GAMER DROID',
878 },
879 'params': {
880 'skip_download': True,
881 },
be49068d 882 'skip': 'This video does not exist.',
313dfc45 883 },
7caf9830
S
884 {
885 # Video licensed under Creative Commons
886 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
887 'info_dict': {
888 'id': 'M4gD1WSo5mA',
889 'ext': 'mp4',
890 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
891 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 892 'duration': 721,
7caf9830
S
893 'upload_date': '20150127',
894 'uploader_id': 'BerkmanCenter',
ec85ded8 895 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 896 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
897 'license': 'Creative Commons Attribution license (reuse allowed)',
898 },
899 'params': {
900 'skip_download': True,
901 },
902 },
fd050249
S
903 {
904 # Channel-like uploader_url
905 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
906 'info_dict': {
907 'id': 'eQcmzGIKrzg',
908 'ext': 'mp4',
909 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
910 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 911 'duration': 4060,
fd050249 912 'upload_date': '20151119',
eb6793ba 913 'uploader': 'Bernie Sanders',
fd050249 914 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
916 'license': 'Creative Commons Attribution license (reuse allowed)',
917 },
918 'params': {
919 'skip_download': True,
920 },
921 },
040ac686
S
922 {
923 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
924 'only_matching': True,
7f29cf54
S
925 },
926 {
067aa17e 927 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
928 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
929 'only_matching': True,
6496ccb4
S
930 },
931 {
932 # Rental video preview
933 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
934 'info_dict': {
935 'id': 'uGpuVWrhIzE',
936 'ext': 'mp4',
937 'title': 'Piku - Trailer',
938 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
939 'upload_date': '20150811',
940 'uploader': 'FlixMatrix',
941 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 942 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
943 'license': 'Standard YouTube License',
944 },
945 'params': {
946 'skip_download': True,
947 },
eb6793ba 948 'skip': 'This video is not available.',
022a5d66 949 },
12afdc2a
S
950 {
951 # YouTube Red video with episode data
952 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
953 'info_dict': {
954 'id': 'iqKdEhx-dD4',
955 'ext': 'mp4',
956 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 957 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 958 'duration': 2085,
12afdc2a
S
959 'upload_date': '20170118',
960 'uploader': 'Vsauce',
961 'uploader_id': 'Vsauce',
962 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
963 'series': 'Mind Field',
964 'season_number': 1,
965 'episode_number': 1,
966 },
967 'params': {
968 'skip_download': True,
969 },
970 'expected_warnings': [
971 'Skipping DASH manifest',
972 ],
973 },
c7121fa7
S
974 {
975 # The following content has been identified by the YouTube community
976 # as inappropriate or offensive to some audiences.
977 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
978 'info_dict': {
979 'id': '6SJNVb0GnPI',
980 'ext': 'mp4',
981 'title': 'Race Differences in Intelligence',
982 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
983 'duration': 965,
984 'upload_date': '20140124',
985 'uploader': 'New Century Foundation',
986 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
988 },
989 'params': {
990 'skip_download': True,
991 },
992 },
022a5d66
S
993 {
994 # itag 212
995 'url': '1t24XAntNCY',
996 'only_matching': True,
fd5c4aab
S
997 },
998 {
999 # geo restricted to JP
1000 'url': 'sJL6WA-aGkQ',
1001 'only_matching': True,
1002 },
cd5a74a2
S
1003 {
1004 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1005 'only_matching': True,
1006 },
825cd268
RA
1007 {
1008 # DRM protected
1009 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1010 'only_matching': True,
4fe54c12
S
1011 },
1012 {
1013 # Video with unsupported adaptive stream type formats
1014 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1015 'info_dict': {
1016 'id': 'Z4Vy8R84T1U',
1017 'ext': 'mp4',
1018 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1019 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1020 'duration': 433,
1021 'upload_date': '20130923',
1022 'uploader': 'Amelia Putri Harwita',
1023 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1024 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1025 'formats': 'maxcount:10',
1026 },
1027 'params': {
1028 'skip_download': True,
1029 'youtube_include_dash_manifest': False,
1030 },
5429d6a9 1031 'skip': 'not actual anymore',
5caabd3c 1032 },
1033 {
822b9d9c 1034 # Youtube Music Auto-generated description
5caabd3c 1035 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1036 'info_dict': {
1037 'id': 'MgNrAu2pzNs',
1038 'ext': 'mp4',
1039 'title': 'Voyeur Girl',
1040 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1041 'upload_date': '20190312',
5429d6a9
S
1042 'uploader': 'Stephen - Topic',
1043 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1044 'artist': 'Stephen',
1045 'track': 'Voyeur Girl',
1046 'album': 'it\'s too much love to know my dear',
1047 'release_date': '20190313',
1048 'release_year': 2019,
1049 },
1050 'params': {
1051 'skip_download': True,
1052 },
1053 },
66b48727
RA
1054 {
1055 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1056 'only_matching': True,
1057 },
011e75e6
S
1058 {
1059 # invalid -> valid video id redirection
1060 'url': 'DJztXj2GPfl',
1061 'info_dict': {
1062 'id': 'DJztXj2GPfk',
1063 'ext': 'mp4',
1064 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1065 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1066 'upload_date': '20090125',
1067 'uploader': 'Prochorowka',
1068 'uploader_id': 'Prochorowka',
1069 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1070 'artist': 'Panjabi MC',
1071 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1072 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1073 },
1074 'params': {
1075 'skip_download': True,
1076 },
ea74e00b
DP
1077 },
1078 {
1079 # empty description results in an empty string
1080 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1081 'info_dict': {
1082 'id': 'x41yOUIvK2k',
1083 'ext': 'mp4',
1084 'title': 'IMG 3456',
1085 'description': '',
1086 'upload_date': '20170613',
1087 'uploader_id': 'ElevageOrVert',
1088 'uploader': 'ElevageOrVert',
1089 },
1090 'params': {
1091 'skip_download': True,
1092 },
1093 },
a0566bbf 1094 {
1095 # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
1096 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1097 'info_dict': {
1098 'id': 'CHqg6qOn4no',
1099 'ext': 'mp4',
1100 'title': 'Part 77 Sort a list of simple types in c#',
1101 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1102 'upload_date': '20130831',
1103 'uploader_id': 'kudvenkat',
1104 'uploader': 'kudvenkat',
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
2eb88d95
PH
1110 ]
1111
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and its per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: maps (player_url, signature cache id)
    # to the deciphering callable extracted for that player.
    self._player_cache = dict()
e0df6211 1115
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1119
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1123
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1127
def report_rtmp_download(self):
    """Tell the user that the download will go over the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1131
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the "shape" of a signature as a string.

    The signature is split on '.', and the lengths of the resulting parts
    are joined with '.' again (e.g. a signature made of an 8-character and
    a 4-character part yields '8.4').
    """
    part_lengths = [
        compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1135
e40c758c
S
1136 @classmethod
1137 def _extract_player_info(cls, player_url):
1138 for player_re in cls._PLAYER_INFO_RE:
1139 id_m = re.search(player_re, player_url)
1140 if id_m:
1141 break
1142 else:
c081b35c 1143 raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1144 return id_m.group('ext'), id_m.group('id')
1145
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable deciphering signatures shaped like example_sig.

    The result is looked up in the downloader cache first (section
    'youtube-sigfuncs'); on a miss the player is downloaded, parsed with
    _parse_sig_js or _parse_sig_swf, and the outcome is stored in the cache.

    video_id is only used for the download helpers, player_url is the URL
    of the player and example_sig a signature whose shape selects the
    cache entry.

    Raises ExtractorError if the player cannot be identified, if the
    computed cache id is not a plain file name, or if the player type is
    neither 'js' nor 'swf'.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is handed to the cache as a key, so it must not contain any
    # path component. This is raised explicitly (not asserted) so that the
    # check is still performed when Python runs with -O.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec lists, for every output position, the index of
        # the input character that ends up there.
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Previously "assert False": with -O that statement disappears and
        # the code below failed with an UnboundLocalError on res instead.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose characters are their own indices
    # (code points 0..n-1); the code points of the output then describe
    # which input index lands at each output position.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1185
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is applied to a probe string as long as example_sig; the resulting
    character order is turned into an expression made of indexing and
    slicing operations on a variable named s, which is shown to the user
    via to_screen together with a guard on the signature's part lengths.
    """
    def gen_sig_code(idxs):
        # Yield 's[...]' fragments that, concatenated, pick the characters
        # at the positions listed in idxs. Runs of indices that go up or
        # down by one are collapsed into a single slice.
        def _genslice(start, end, step):
            # Slice expression covering start..end inclusive with the given
            # step; the stop is left open when end + step would be negative.
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: extend it or flush it as a slice.
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run starts at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Emit whatever is still pending for the last index.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Probe string whose characters are their own indices (code points
    # 0..n-1), so the output's code points reveal the index mapping.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1224
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature deciphering callable from the JS player code.

    The name of the signature function is located with a list of regular
    expressions (tried in order), then that function is extracted with
    JSInterpreter. The returned callable takes the signature string and
    passes it to the extracted function as its single argument.
    """
    sig_name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    sig_func_name = self._search_regex(
        sig_name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(sig_func_name)

    def run(s):
        # The extracted function expects its arguments as a list.
        return decipher([s])

    return run
1245
def _parse_sig_swf(self, file_contents):
    """Build a signature deciphering callable from the SWF player.

    The 'decipher' function of the class 'SignatureDecipher' is extracted
    with SWFInterpreter. The returned callable takes the signature string
    and passes it to that function as its single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        # The extracted function expects its arguments as a list.
        return decipher([s])

    return run
1252
83799698 1253 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1254 """Turn the encrypted s field into a working signature"""
6b37f0be 1255
c8bf86d5 1256 if player_url is None:
69ea8ca4 1257 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1258
69ea8ca4 1259 if player_url.startswith('//'):
78caa52a 1260 player_url = 'https:' + player_url
3c90cc8b
S
1261 elif not re.match(r'https?://', player_url):
1262 player_url = compat_urlparse.urljoin(
1263 'https://www.youtube.com', player_url)
c8bf86d5 1264 try:
62af3a0e 1265 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1266 if player_id not in self._player_cache:
1267 func = self._extract_signature_function(
60064c53 1268 video_id, player_url, s
c8bf86d5
PH
1269 )
1270 self._player_cache[player_id] = func
1271 func = self._player_cache[player_id]
1272 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1273 self._print_sig_code(func, s)
c8bf86d5
PH
1274 return func(s)
1275 except Exception as e:
1276 tb = traceback.format_exc()
1277 raise ExtractorError(
78caa52a 1278 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1279
f96f5dda 1280 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
de7f3446 1281 try:
60e47a26 1282 subs_doc = self._download_xml(
38c2e5b8 1283 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1284 video_id, note=False)
1285 except ExtractorError as err:
9b9c5355 1286 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
de7f3446 1287 return {}
de7f3446
JMF
1288
1289 sub_lang_list = {}
60e47a26
JMF
1290 for track in subs_doc.findall('track'):
1291 lang = track.attrib['lang_code']
7e660ac1
LD
1292 if lang in sub_lang_list:
1293 continue
360e1ca5 1294 sub_formats = []
23d17e4b 1295 for ext in self._SUBTITLE_FORMATS:
15707c7e 1296 params = compat_urllib_parse_urlencode({
360e1ca5
JMF
1297 'lang': lang,
1298 'v': video_id,
1299 'fmt': ext,
1300 'name': track.attrib['name'].encode('utf-8'),
1301 })
1302 sub_formats.append({
1303 'url': 'https://www.youtube.com/api/timedtext?' + params,
1304 'ext': ext,
1305 })
1306 sub_lang_list[lang] = sub_formats
9f448fcb 1307 if has_live_chat_replay:
321bf820 1308 sub_lang_list['live_chat'] = [
1309 {
1310 'video_id': video_id,
1311 'ext': 'json',
1312 'protocol': 'youtube_live_chat_replay',
1313 },
9f448fcb 1314 ]
de7f3446 1315 if not sub_lang_list:
69ea8ca4 1316 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
1317 return {}
1318 return sub_lang_list
1319
a72778d3
S
1320 def _get_ytplayer_config(self, video_id, webpage):
1321 patterns = (
526b3b07
S
1322 # User data may contain arbitrary character sequences that may affect
1323 # JSON extraction with regex, e.g. when '};' is contained the second
1324 # regex won't capture the whole JSON. Yet working around by trying more
1325 # concrete regex first keeping in mind proper quoted string handling
1326 # to be implemented in future that will replace this workaround (see
067aa17e
S
1327 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1328 # https://github.com/ytdl-org/youtube-dl/pull/7599)
a72778d3
S
1329 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1330 r';ytplayer\.config\s*=\s*({.+?});',
8bdd16b4 1331 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
a72778d3
S
1332 )
1333 config = self._search_regex(
1334 patterns, webpage, 'ytplayer.config', default=None)
1335 if config:
1336 return self._parse_json(
1337 uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1338
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music metadata from the parsed initial data of a watch page.

    The metadata rows of every videoSecondaryInfoRenderer are scanned for
    the titles 'Album', 'Artist' and 'Song'. A title that shows up again
    within the same renderer starts a new track.

    Returns a list of dicts whose keys are among 'album', 'artist' and
    'track'; the list is empty if nothing was found or the data does not
    have the expected structure.
    """
    music_metadata = []
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
        list)
    for content in contents or []:
        if not isinstance(content, dict):
            continue
        music_track = {}
        secondary_info = try_get(
            content, lambda x: x['videoSecondaryInfoRenderer'], dict)
        if secondary_info is None:
            continue
        rows = try_get(
            secondary_info,
            lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
            list)
        if rows is None:
            continue
        for row in rows:
            row_renderer = try_get(
                row, lambda x: x['metadataRowRenderer'], dict)
            if row_renderer is None:
                continue
            key = try_get(row_renderer, lambda x: x['title']['simpleText'])
            value = try_get(row_renderer, lambda x: x['contents'][0]['simpleText']) or \
                try_get(row_renderer, lambda x: x['contents'][0]['runs'][0]['text'])
            # compat_str rather than str: on Python 2 the strings of parsed
            # JSON are unicode objects, which an exact "str" type check
            # rejects, so no row was ever accepted there.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            if key in key_map:
                if key_map[key] in music_track:
                    # we've started on a new track
                    music_metadata.append(music_track)
                    music_track = {}
                music_track[key_map[key]] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1376
def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic captions of a video.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The result maps a language code to a list of {'url', 'ext'} dicts, one
    per entry of self._SUBTITLE_FORMATS. Three sources are tried in order:
    the 'ttsurl' player argument, the 'player_response' player argument
    and finally the 'caption_tracks' / 'caption_translation_languages'
    player arguments. An empty dict is returned (after a warning) when the
    player config or the required data is missing.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # One entry per 'target' node, each requesting the original
            # track with that node's language as tlang.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Build the captions dict by rewriting the query of sub_url
            # with 'tlang' and 'fmt' for every language/format pair.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                # The first caption track serves as the base for all
                # translation languages.
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1478
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback tracking URL of a video ('Marking watched').

    The URL is taken from player_response, falling back to video_info.
    Nothing happens if no valid URL is found; the request itself is
    issued as non-fatal.
    """
    base_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not base_url:
        base_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(base_url)
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]

    query['ver'] = ['2']
    query['cpn'] = [''.join(cpn_chars)]
    encoded_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=encoded_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1504
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Collect YouTube embeds referenced in *webpage*.

    Returns a list holding, in this order:
      * URLs of embedded players (iframe/embed/object tags,
        ``data-video-url`` attributes, ``embedSWF(...)`` and
        ``new SWFObject(...)`` calls), HTML-unescaped;
      * values of ``data-youtube-id`` on lazyYT elements, HTML-unescaped;
      * values of ``data-video_id`` from the Wordpress "YouTube Video
        Importer" plugin markup.
    """
    # Embedded YouTube player
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall yields (q1, q2, video_id) tuples; only the last item is wanted.
    entries.extend(m[-1] for m in matches)

    return entries
1536
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by ``YoutubeIE._extract_urls``, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1541
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of ``cls._VALID_URL`` matched on *url*.

    ``_VALID_URL`` is applied in verbose mode, anchored at the start of the
    string.  Raises ExtractorError when *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1549
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build a chapter list from the initial data embedded in *webpage*.

    Returns None when there is no webpage, when the initial data is missing
    or is not a dict, or when it holds no chapter list.  Otherwise returns
    a list of dicts with 'start_time', 'end_time' and 'title'.  A chapter
    ends where the next one starts; the last one ends at *duration*.
    Chapters whose start or end time cannot be determined are left out.
    """
    if not webpage:
        return
    initial_data = self._extract_yt_initial_data(video_id, webpage)
    if not (initial_data and isinstance(initial_data, dict)):
        return
    raw_chapters = try_get(
        initial_data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def start_seconds(item):
        # Start offsets are given in milliseconds; scale them to seconds.
        millis = try_get(
            item, lambda x: x['chapterRenderer']['timeRangeStartMillis'], int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    result = []
    for index, item in enumerate(raw_chapters):
        begin = start_seconds(item)
        if begin is None:
            continue
        if index + 1 < total:
            finish = start_seconds(raw_chapters[index + 1])
        else:
            finish = duration
        if finish is None:
            continue
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': try_get(
                item, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return result
1594
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build a chapter list from seek links in an HTML description.

    *description* is searched for lines (delimited by ``<br />``) that
    contain an ``<a ... onclick="yt.www.watch.player.seekTo...">`` link
    whose text is a time point.  Each such line becomes a chapter that
    starts at its time point and ends at the next line's time point; the
    last one ends at *duration*.

    Returns None when the description is empty or has no such lines,
    otherwise a list of dicts with 'start_time', 'end_time' and 'title'.

    *duration* may be None: then no clamping against the video length is
    done, and the last chapter is dropped because its end is unknown.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Guard against duration=None: ordering a number against None
        # raises TypeError on Python 3.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # The title is the line without the time link and surrounding
        # separators, with whitespace runs collapsed.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1629
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page's JSON data, else from the description.

    The description is only consulted when the JSON lookup yields nothing
    (None or an empty list).
    """
    chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if chapters:
        return chapters
    return self._extract_chapters_from_description(description, duration)
1633
c5e8d7af 1634 def _real_extract(self, url):
cf7e015f
S
1635 url, smuggled_data = unsmuggle_url(url, {})
1636
7e8c0af0 1637 proto = (
78caa52a
PH
1638 'http' if self._downloader.params.get('prefer_insecure', False)
1639 else 'https')
7e8c0af0 1640
7c80519c 1641 start_time = None
297a564b 1642 end_time = None
7c80519c
JMF
1643 parsed_url = compat_urllib_parse_urlparse(url)
1644 for component in [parsed_url.fragment, parsed_url.query]:
1645 query = compat_parse_qs(component)
297a564b 1646 if start_time is None and 't' in query:
7c80519c 1647 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1648 if start_time is None and 'start' in query:
1649 start_time = parse_duration(query['start'][0])
297a564b
JMF
1650 if end_time is None and 'end' in query:
1651 end_time = parse_duration(query['end'][0])
7c80519c 1652
c5e8d7af
PH
1653 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1654 mobj = re.search(self._NEXT_URL_RE, url)
1655 if mobj:
7fd002c0 1656 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1657 video_id = self.extract_id(url)
c5e8d7af
PH
1658
1659 # Get video webpage
aa79ac0c 1660 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1661 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1662
1663 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1664 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1665
1666 # Attempt to extract SWF player URL
e0df6211 1667 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1668 if mobj is not None:
1669 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1670 else:
1671 player_url = None
1672
d8d24a92
S
1673 dash_mpds = []
1674
1675 def add_dash_mpd(video_info):
1676 dash_mpd = video_info.get('dashmpd')
1677 if dash_mpd and dash_mpd[0] not in dash_mpds:
1678 dash_mpds.append(dash_mpd[0])
1679
561b456e
S
1680 def add_dash_mpd_pr(pl_response):
1681 dash_mpd = url_or_none(try_get(
1682 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1683 compat_str))
1684 if dash_mpd and dash_mpd not in dash_mpds:
1685 dash_mpds.append(dash_mpd)
1686
c7121fa7
S
1687 is_live = None
1688 view_count = None
1689
1690 def extract_view_count(v_info):
1691 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1692
c2d125d9
S
1693 def extract_player_response(player_response, video_id):
1694 pl_response = str_or_none(player_response)
1695 if not pl_response:
1696 return
1697 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1698 if isinstance(pl_response, dict):
1699 add_dash_mpd_pr(pl_response)
1700 return pl_response
1701
fb2c9277
U
1702 def extract_embedded_config(embed_webpage, video_id):
1703 embedded_config = self._search_regex(
1704 r'setConfig\(({.*})\);',
1705 embed_webpage, 'ytInitialData', default=None)
1706 if embedded_config:
1707 return embedded_config
1708
dbdaaa23
S
1709 player_response = {}
1710
c5e8d7af 1711 # Get video info
43ebf77d 1712 video_info = {}
6449cd80 1713 embed_webpage = None
39e7107d
U
1714 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1715 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1716 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1717 age_gate = True
1718 # We simulate the access to the video from www.youtube.com/v/{video_id}
1719 # this can be viewed without login into Youtube
beb95e77
CL
1720 url = proto + '://www.youtube.com/embed/%s' % video_id
1721 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1722 ext = extract_embedded_config(embed_webpage, video_id)
1723 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1724 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1725 if not playable_in_embed:
1726 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1727 playable_in_embed = ''
1728 else:
1729 playable_in_embed = playable_in_embed.group('playableinEmbed')
1730 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1731 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1732 if playable_in_embed == 'false':
c73baf23
U
1733 '''
1734 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1735 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1736 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1737 '''
1738 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1739 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1740 age_gate = False
1741 # Try looking directly into the video webpage
1742 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1743 if ytplayer_config:
59c5fa91
PO
1744 args = ytplayer_config.get("args")
1745 if args is not None:
1746 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1747 # Convert to the same format returned by compat_parse_qs
1748 video_info = dict((k, [v]) for k, v in args.items())
1749 add_dash_mpd(video_info)
1750 # Rental video is not rented but preview is available (e.g.
1751 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1752 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1753 if not video_info and args.get('ypc_vid'):
1754 return self.url_result(
1755 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1756 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1757 is_live = True
1758 if not player_response:
1759 player_response = extract_player_response(args.get('player_response'), video_id)
1760 elif not player_response:
1761 player_response = ytplayer_config
4bb9c880
U
1762 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1763 add_dash_mpd_pr(player_response)
9d9314cb
U
1764 else:
1765 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1766 else:
1767 data = compat_urllib_parse_urlencode({
1768 'video_id': video_id,
1769 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1770 'sts': self._search_regex(
1771 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1772 })
1773 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1774 try:
1775 video_info_webpage = self._download_webpage(
1776 video_info_url, video_id,
1777 note='Refetching age-gated info webpage',
1778 errnote='unable to download video info webpage')
1779 except ExtractorError:
1780 video_info_webpage = None
1781 if video_info_webpage:
1782 video_info = compat_parse_qs(video_info_webpage)
1783 pl_response = video_info.get('player_response', [None])[0]
1784 player_response = extract_player_response(pl_response, video_id)
1785 add_dash_mpd(video_info)
1786 view_count = extract_view_count(video_info)
c108eb73
JMF
1787 else:
1788 age_gate = False
d8d24a92 1789 # Try looking directly into the video webpage
a72778d3 1790 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
8bdd16b4 1791 if ytplayer_config:
1792 args = ytplayer_config.get('args', {})
4c76aa06 1793 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1794 # Convert to the same format returned by compat_parse_qs
1795 video_info = dict((k, [v]) for k, v in args.items())
1796 add_dash_mpd(video_info)
6496ccb4
S
1797 # Rental video is not rented but preview is available (e.g.
1798 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1799 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1800 if not video_info and args.get('ypc_vid'):
1801 return self.url_result(
1802 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1803 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1804 is_live = True
dbdaaa23 1805 if not player_response:
c2d125d9 1806 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1807 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1808 add_dash_mpd_pr(player_response)
bbb7c3f7 1809
8bdd16b4 1810 if not video_info and not player_response:
1811 player_response = extract_player_response(
1812 self._search_regex(
1813 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1814 'initial player response', default='{}'),
1815 video_id)
1816
bbb7c3f7 1817 def extract_unavailable_message():
0add33ab
S
1818 messages = []
1819 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1820 msg = self._html_search_regex(
1821 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1822 video_webpage, 'unavailable %s' % kind, default=None)
1823 if msg:
1824 messages.append(msg)
1825 if messages:
1826 return '\n'.join(messages)
bbb7c3f7 1827
f93abcf1 1828 if not video_info and not player_response:
15be3eb5
RA
1829 unavailable_message = extract_unavailable_message()
1830 if not unavailable_message:
1831 unavailable_message = 'Unable to extract video data'
1832 raise ExtractorError(
1833 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1834
f93abcf1
S
1835 if not isinstance(video_info, dict):
1836 video_info = {}
1837
dbdaaa23
S
1838 video_details = try_get(
1839 player_response, lambda x: x['videoDetails'], dict) or {}
1840
37357d21
S
1841 microformat = try_get(
1842 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1843
8dbf751a
RA
1844 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1845 if not video_title:
cf7e015f
S
1846 self._downloader.report_warning('Unable to extract video title')
1847 video_title = '_'
1848
9cafc3fd 1849 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1850 if video_description:
fa4bc6e7
RA
1851
1852 def replace_url(m):
1853 redir_url = compat_urlparse.urljoin(url, m.group(1))
1854 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1855 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1856 qs = compat_parse_qs(parsed_redir_url.query)
1857 q = qs.get('q')
1858 if q and q[0]:
1859 return q[0]
1860 return redir_url
1861
9cafc3fd 1862 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1863 <a\s+
25cb7a0e 1864 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1865 (?:title|href)="([^"]+)"\s+
25cb7a0e 1866 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1867 class="[^"]*"[^>]*>
23f13e97 1868 [^<]+\.{3}\s*
cf7e015f 1869 </a>
fa4bc6e7 1870 ''', replace_url, video_description)
cf7e015f
S
1871 video_description = clean_html(video_description)
1872 else:
ea74e00b
DP
1873 video_description = video_details.get('shortDescription')
1874 if video_description is None:
1875 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1876
8fe10494 1877 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1878 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1879 multifeed_metadata_list = try_get(
1880 player_response,
1881 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1882 compat_str) or try_get(
1883 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1884 if multifeed_metadata_list:
1885 entries = []
1886 feed_ids = []
1887 for feed in multifeed_metadata_list.split(','):
1888 # Unquote should take place before split on comma (,) since textual
1889 # fields may contain comma as well (see
067aa17e 1890 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1891 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1892
1893 def feed_entry(name):
1894 return try_get(feed_data, lambda x: x[name][0], compat_str)
1895
1896 feed_id = feed_entry('id')
1897 if not feed_id:
1898 continue
1899 feed_title = feed_entry('title')
1900 title = video_title
1901 if feed_title:
1902 title += ' (%s)' % feed_title
8fe10494
S
1903 entries.append({
1904 '_type': 'url_transparent',
1905 'ie_key': 'Youtube',
1906 'url': smuggle_url(
1907 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1908 {'force_singlefeed': True}),
6b09401b 1909 'title': title,
8fe10494 1910 })
6b09401b 1911 feed_ids.append(feed_id)
8fe10494
S
1912 self.to_screen(
1913 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1914 % (', '.join(feed_ids), video_id))
1915 return self.playlist_result(entries, video_id, video_title, video_description)
1916 else:
1917 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1918
c7121fa7 1919 if view_count is None:
1c9c8de2 1920 view_count = extract_view_count(video_info)
dbdaaa23
S
1921 if view_count is None and video_details:
1922 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1923 if view_count is None and microformat:
1924 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1925
27019dbb 1926 if is_live is None:
898238e9 1927 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1928
321bf820 1929 has_live_chat_replay = False
f0f76a33 1930 if not is_live:
321bf820 1931 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1932 try:
1933 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1934 has_live_chat_replay = True
f0f76a33 1935 except (KeyError, IndexError, TypeError):
321bf820 1936 pass
1937
c5e8d7af
PH
1938 # Check for "rental" videos
1939 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1940 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1941
c63ca0ee
S
1942 def _extract_filesize(media_url):
1943 return int_or_none(self._search_regex(
1944 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1945
bf1317d2
S
1946 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1947 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1948
c5e8d7af
PH
1949 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1950 self.report_rtmp_download()
dd27fd17
PH
1951 formats = [{
1952 'format_id': '_rtmp',
1953 'protocol': 'rtmp',
1954 'url': video_info['conn'][0],
1955 'player_url': player_url,
1956 }]
bf1317d2 1957 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1958 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1959 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1960 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1961 formats = []
3318832e 1962 formats_spec = {}
82156fdb 1963 fmt_list = video_info.get('fmt_list', [''])[0]
1964 if fmt_list:
1965 for fmt in fmt_list.split(','):
1966 spec = fmt.split('/')
3318832e 1967 if len(spec) > 1:
1968 width_height = spec[1].split('x')
1969 if len(width_height) == 2:
1970 formats_spec[spec[0]] = {
1971 'resolution': spec[1],
1972 'width': int_or_none(width_height[0]),
1973 'height': int_or_none(width_height[1]),
1974 }
bf1317d2
S
1975 for fmt in streaming_formats:
1976 itag = str_or_none(fmt.get('itag'))
1977 if not itag:
201e9eaa 1978 continue
bf1317d2
S
1979 quality = fmt.get('quality')
1980 quality_label = fmt.get('qualityLabel') or quality
1981 formats_spec[itag] = {
1982 'asr': int_or_none(fmt.get('audioSampleRate')),
1983 'filesize': int_or_none(fmt.get('contentLength')),
1984 'format_note': quality_label,
1985 'fps': int_or_none(fmt.get('fps')),
1986 'height': int_or_none(fmt.get('height')),
bf1317d2
S
1987 # bitrate for itag 43 is always 2147483647
1988 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1989 'width': int_or_none(fmt.get('width')),
1990 }
1991
1992 for fmt in streaming_formats:
00eb865b 1993 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
1994 continue
1995 url = url_or_none(fmt.get('url'))
1996
1997 if not url:
fa3db383 1998 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
1999 if not cipher:
2000 continue
2001 url_data = compat_parse_qs(cipher)
2002 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2003 if not url:
2004 continue
2005 else:
2006 cipher = None
2007 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2008
2f483bc1
S
2009 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2010 # Unsupported FORMAT_STREAM_TYPE_OTF
2011 if stream_type == 3:
2012 continue
6449cd80 2013
bf1317d2
S
2014 format_id = fmt.get('itag') or url_data['itag'][0]
2015 if not format_id:
2016 continue
2017 format_id = compat_str(format_id)
a49eccdf 2018
bf1317d2
S
2019 if cipher:
2020 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
8bdd16b4 2021 ASSETS_RE = (
2022 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2023 r'"jsUrl"\s*:\s*("[^"]+")',
2024 r'"assets":.+?"js":\s*("[^"]+")')
bf1317d2
S
2025 jsplayer_url_json = self._search_regex(
2026 ASSETS_RE,
2027 embed_webpage if age_gate else video_webpage,
2028 'JS player URL (1)', default=None)
2029 if not jsplayer_url_json and not age_gate:
2030 # We need the embed website after all
2031 if embed_webpage is None:
2032 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2033 embed_webpage = self._download_webpage(
2034 embed_url, video_id, 'Downloading embed webpage')
2035 jsplayer_url_json = self._search_regex(
2036 ASSETS_RE, embed_webpage, 'JS player URL')
2037
2038 player_url = json.loads(jsplayer_url_json)
cf010131 2039 if player_url is None:
bf1317d2
S
2040 player_url_json = self._search_regex(
2041 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2042 video_webpage, 'age gate player URL')
2043 player_url = json.loads(player_url_json)
2044
2045 if 'sig' in url_data:
2046 url += '&signature=' + url_data['sig'][0]
2047 elif 's' in url_data:
2048 encrypted_sig = url_data['s'][0]
2049
2050 if self._downloader.params.get('verbose'):
2051 if player_url is None:
bf1317d2 2052 player_desc = 'unknown'
cf010131 2053 else:
e40c758c
S
2054 player_type, player_version = self._extract_player_info(player_url)
2055 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2056 parts_sizes = self._signature_cache_id(encrypted_sig)
2057 self.to_screen('{%s} signature length %s, %s' %
2058 (format_id, parts_sizes, player_desc))
2059
2060 signature = self._decrypt_signature(
2061 encrypted_sig, video_id, player_url, age_gate)
2062 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2063 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2064 if 'ratebypass' not in url:
2065 url += '&ratebypass=yes'
c9afb51c 2066
94278f72
YCH
2067 dct = {
2068 'format_id': format_id,
2069 'url': url,
2070 'player_url': player_url,
2071 }
2072 if format_id in self._formats:
2073 dct.update(self._formats[format_id])
3318832e 2074 if format_id in formats_spec:
2075 dct.update(formats_spec[format_id])
94278f72 2076
aabc2be6 2077 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2078 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2079 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2080 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2081 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2082
bf1317d2
S
2083 if width is None:
2084 width = int_or_none(fmt.get('width'))
2085 if height is None:
2086 height = int_or_none(fmt.get('height'))
2087
c63ca0ee
S
2088 filesize = int_or_none(url_data.get(
2089 'clen', [None])[0]) or _extract_filesize(url)
2090
bf1317d2
S
2091 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2092 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2093
4878759f
S
2094 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2095 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2096 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2097
94278f72 2098 more_fields = {
c63ca0ee 2099 'filesize': filesize,
bf1317d2 2100 'tbr': tbr,
c9afb51c
AH
2101 'width': width,
2102 'height': height,
bf1317d2
S
2103 'fps': fps,
2104 'format_note': quality_label or quality,
c9afb51c 2105 }
94278f72
YCH
2106 for key, value in more_fields.items():
2107 if value:
2108 dct[key] = value
bf1317d2 2109 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2110 if type_:
2111 type_split = type_.split(';')
2112 kind_ext = type_split[0].split('/')
2113 if len(kind_ext) == 2:
94278f72
YCH
2114 kind, _ = kind_ext
2115 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2116 if kind in ('audio', 'video'):
2117 codecs = None
2118 for mobj in re.finditer(
2119 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2120 if mobj.group('key') == 'codecs':
2121 codecs = mobj.group('val')
2122 break
2123 if codecs:
6310acf5 2124 dct.update(parse_codecs(codecs))
e4a60912
S
2125 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2126 dct['downloader_options'] = {
2127 # Youtube throttles chunks >~10M
2128 'http_chunk_size': 10485760,
2129 }
aabc2be6 2130 formats.append(dct)
c5e8d7af 2131 else:
c3e54389
S
2132 manifest_url = (
2133 url_or_none(try_get(
2134 player_response,
2135 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2136 compat_str))
2137 or url_or_none(try_get(
c3e54389
S
2138 video_info, lambda x: x['hlsvp'][0], compat_str)))
2139 if manifest_url:
2140 formats = []
2141 m3u8_formats = self._extract_m3u8_formats(
2142 manifest_url, video_id, 'mp4', fatal=False)
2143 for a_format in m3u8_formats:
2144 itag = self._search_regex(
2145 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2146 if itag:
2147 a_format['format_id'] = itag
2148 if itag in self._formats:
2149 dct = self._formats[itag].copy()
2150 dct.update(a_format)
2151 a_format = dct
2152 a_format['player_url'] = player_url
2153 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2154 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2155 if self._downloader.params.get('youtube_include_hls_manifest', True):
2156 formats.append(a_format)
c3e54389 2157 else:
13577349 2158 error_message = extract_unavailable_message()
a0566bbf 2159 if not error_message:
2160 reason_list = try_get(
2161 player_response,
2162 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2163 list) or []
2164 for reason in reason_list:
2165 if not isinstance(reason, dict):
2166 continue
2167 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2168 if reason_text:
2169 if not error_message:
2170 error_message = ''
2171 error_message += reason_text
2172 if error_message:
2173 error_message = clean_html(error_message)
c3e54389 2174 if not error_message:
13577349
S
2175 error_message = clean_html(try_get(
2176 player_response, lambda x: x['playabilityStatus']['reason'],
2177 compat_str))
2178 if not error_message:
2179 error_message = clean_html(
2180 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2181 if error_message:
2182 raise ExtractorError(error_message, expected=True)
2183 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2184
7e72694b 2185 # uploader
dbdaaa23
S
2186 video_uploader = try_get(
2187 video_info, lambda x: x['author'][0],
2188 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2189 if video_uploader:
2190 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2191 else:
2192 self._downloader.report_warning('unable to extract uploader name')
2193
2194 # uploader_id
2195 video_uploader_id = None
2196 video_uploader_url = None
2197 mobj = re.search(
2198 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2199 video_webpage)
2200 if mobj is not None:
2201 video_uploader_id = mobj.group('uploader_id')
2202 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2203 else:
2204 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2205 if owner_profile_url:
2206 video_uploader_id = self._search_regex(
2207 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2208 default=None)
2209 video_uploader_url = owner_profile_url
7e72694b 2210
b45a9e69 2211 channel_id = (
3089bc74
S
2212 str_or_none(video_details.get('channelId'))
2213 or self._html_search_meta(
2214 'channelId', video_webpage, 'channel id', default=None)
2215 or self._search_regex(
b45a9e69 2216 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2217 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2218 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2219
b477fc13
S
2220 thumbnails = []
2221 thumbnails_list = try_get(
2222 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2223 for t in thumbnails_list:
2224 if not isinstance(t, dict):
2225 continue
2226 thumbnail_url = url_or_none(t.get('url'))
2227 if not thumbnail_url:
2228 continue
2229 thumbnails.append({
2230 'url': thumbnail_url,
2231 'width': int_or_none(t.get('width')),
2232 'height': int_or_none(t.get('height')),
2233 })
2234
2235 if not thumbnails:
7e72694b 2236 video_thumbnail = None
b477fc13
S
2237 # We try first to get a high quality image:
2238 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2239 video_webpage, re.DOTALL)
2240 if m_thumb is not None:
2241 video_thumbnail = m_thumb.group(1)
2242 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2243 if thumbnail_url:
2244 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2245 if video_thumbnail:
2246 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2247
2248 # upload date
2249 upload_date = self._html_search_meta(
2250 'datePublished', video_webpage, 'upload date', default=None)
2251 if not upload_date:
2252 upload_date = self._search_regex(
2253 [r'(?s)id="eow-date.*?>(.*?)</span>',
2254 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2255 video_webpage, 'upload date', default=None)
37357d21
S
2256 if not upload_date:
2257 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2258 upload_date = unified_strdate(upload_date)
2259
2260 video_license = self._html_search_regex(
2261 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2262 video_webpage, 'license', default=None)
2263
2264 m_music = re.search(
2265 r'''(?x)
2266 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2267 <ul[^>]*>\s*
2268 <li>(?P<title>.+?)
2269 by (?P<creator>.+?)
2270 (?:
2271 \(.+?\)|
2272 <a[^>]*
2273 (?:
2274 \bhref=["\']/red[^>]*>| # drop possible
2275 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2276 )
2277 .*?
2278 )?</li
2279 ''',
2280 video_webpage)
2281 if m_music:
2282 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2283 video_creator = clean_html(m_music.group('creator'))
2284 else:
2285 video_alt_title = video_creator = None
2286
2287 def extract_meta(field):
2288 return self._html_search_regex(
2289 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2290 video_webpage, field, default=None)
2291
2292 track = extract_meta('Song')
2293 artist = extract_meta('Artist')
92bc97d3 2294 album = extract_meta('Album')
822b9d9c
RA
2295
2296 # Youtube Music Auto-generated description
92bc97d3 2297 release_date = release_year = None
822b9d9c
RA
2298 if video_description:
2299 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2300 if mobj:
2301 if not track:
2302 track = mobj.group('track').strip()
2303 if not artist:
2304 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2305 if not album:
2306 album = mobj.group('album'.strip())
822b9d9c
RA
2307 release_year = mobj.group('release_year')
2308 release_date = mobj.group('release_date')
2309 if release_date:
2310 release_date = release_date.replace('-', '')
2311 if not release_year:
2312 release_year = int(release_date[:4])
2313 if release_year:
2314 release_year = int(release_year)
7e72694b 2315
9322f116 2316 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2317 if yt_initial:
2318 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2319 if len(music_metadata):
2320 album = music_metadata[0].get('album')
2321 artist = music_metadata[0].get('artist')
2322 track = music_metadata[0].get('track')
2323
7e72694b
S
2324 m_episode = re.search(
2325 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2326 video_webpage)
2327 if m_episode:
c2dd2dc0 2328 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2329 season_number = int(m_episode.group('season'))
2330 episode_number = int(m_episode.group('episode'))
2331 else:
2332 series = season_number = episode_number = None
2333
2334 m_cat_container = self._search_regex(
2335 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2336 video_webpage, 'categories', default=None)
dbeafce5 2337 category = None
7e72694b
S
2338 if m_cat_container:
2339 category = self._html_search_regex(
2340 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2341 default=None)
dbeafce5
S
2342 if not category:
2343 category = try_get(
2344 microformat, lambda x: x['category'], compat_str)
2345 video_categories = None if category is None else [category]
7e72694b
S
2346
2347 video_tags = [
2348 unescapeHTML(m.group('content'))
2349 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2350 if not video_tags:
2351 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2352
2353 def _extract_count(count_name):
2354 return str_to_int(self._search_regex(
a0566bbf 2355 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2356 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
7e72694b
S
2357 video_webpage, count_name, default=None))
2358
2359 like_count = _extract_count('like')
2360 dislike_count = _extract_count('dislike')
2361
dbdaaa23
S
2362 if view_count is None:
2363 view_count = str_to_int(self._search_regex(
2364 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2365 'view count', default=None))
2366
bf3c9326
S
2367 average_rating = (
2368 float_or_none(video_details.get('averageRating'))
2369 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2370
7e72694b 2371 # subtitles
321bf820 2372 video_subtitles = self.extract_subtitles(
2373 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2374 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2375
2376 video_duration = try_get(
2377 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2378 if not video_duration:
2379 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2380 if not video_duration:
2381 video_duration = parse_duration(self._html_search_meta(
2382 'duration', video_webpage, 'video duration'))
2383
b84071c0
JP
2384 # Get Subscriber Count of channel
2385 subscriber_count = parse_count(self._search_regex(
2386 r'"text":"([\d\.]+\w?) subscribers"',
2387 video_webpage,
2388 'subscriber count',
2389 default=None
2390 ))
2391
7e72694b
S
2392 # annotations
2393 video_annotations = None
2394 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2395 xsrf_token = self._search_regex(
2396 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2397 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2398 invideo_url = try_get(
2399 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2400 if xsrf_token and invideo_url:
2401 xsrf_field_name = self._search_regex(
2402 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2403 video_webpage, 'xsrf field name',
2404 group='xsrf_field_name', default='session_token')
2405 video_annotations = self._download_webpage(
2406 self._proto_relative_url(invideo_url),
2407 video_id, note='Downloading annotations',
2408 errnote='Unable to download video annotations', fatal=False,
2409 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2410
84213ea8 2411 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2412
dd27fd17 2413 # Look for the DASH manifest
203fb43f 2414 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2415 dash_mpd_fatal = True
8ff648e4 2416 for mpd_url in dash_mpds:
d8d24a92 2417 dash_formats = {}
774e208f 2418 try:
05d0d131
YCH
2419 def decrypt_sig(mobj):
2420 s = mobj.group(1)
2421 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2422 return '/signature/%s' % dec_s
2423
8ff648e4 2424 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2425
8ff648e4 2426 for df in self._extract_mpd_formats(
2427 mpd_url, video_id, fatal=dash_mpd_fatal,
2428 formats_dict=self._formats):
c63ca0ee
S
2429 if not df.get('filesize'):
2430 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2431 # Do not overwrite DASH format found in some previous DASH manifest
2432 if df['format_id'] not in dash_formats:
2433 dash_formats[df['format_id']] = df
77c6fb5b
S
2434 # Additional DASH manifests may end up in HTTP Error 403 therefore
2435 # allow them to fail without bug report message if we already have
2436 # some DASH manifest succeeded. This is temporary workaround to reduce
2437 # burst of bug reports until we figure out the reason and whether it
2438 # can be fixed at all.
2439 dash_mpd_fatal = False
774e208f
PH
2440 except (ExtractorError, KeyError) as e:
2441 self.report_warning(
2442 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2443 if dash_formats:
04b3b3df
JMF
2444 # Remove the formats we found through non-DASH, they
2445 # contain less info and it can be wrong, because we use
2446 # fixed values (for example the resolution). See
067aa17e 2447 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2448 # example.
d80265cc 2449 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2450 formats.extend(dash_formats.values())
d80044c2 2451
6271f1ca
PH
2452 # Check for malformed aspect ratio
2453 stretched_m = re.search(
2454 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2455 video_webpage)
2456 if stretched_m:
313dfc45
LL
2457 w = float(stretched_m.group('w'))
2458 h = float(stretched_m.group('h'))
5faf9fed
S
2459 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2460 # We will only process correct ratios.
313dfc45 2461 if w > 0 and h > 0:
41f24c32 2462 ratio = w / h
313dfc45
LL
2463 for f in formats:
2464 if f.get('vcodec') != 'none':
2465 f['stretched_ratio'] = ratio
6271f1ca 2466
026fbedc 2467 if not formats:
43ebf77d
S
2468 if 'reason' in video_info:
2469 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2470 regions_allowed = self._html_search_meta(
2471 'regionsAllowed', video_webpage, default=None)
2472 countries = regions_allowed.split(',') if regions_allowed else None
2473 self.raise_geo_restricted(
2474 msg=video_info['reason'][0], countries=countries)
2475 reason = video_info['reason'][0]
2476 if 'Invalid parameters' in reason:
2477 unavailable_message = extract_unavailable_message()
2478 if unavailable_message:
2479 reason = unavailable_message
2480 raise ExtractorError(
2481 'YouTube said: %s' % reason,
2482 expected=True, video_id=video_id)
2483 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2484 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2485
4bcc7bd1 2486 self._sort_formats(formats)
4ea3be0a 2487
21c340b8 2488 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2489
4ea3be0a 2490 return {
8bcc8756
JW
2491 'id': video_id,
2492 'uploader': video_uploader,
2493 'uploader_id': video_uploader_id,
fd050249 2494 'uploader_url': video_uploader_url,
dd4c4492
S
2495 'channel_id': channel_id,
2496 'channel_url': channel_url,
8bcc8756 2497 'upload_date': upload_date,
7caf9830 2498 'license': video_license,
936784b2 2499 'creator': video_creator or artist,
8bcc8756 2500 'title': video_title,
936784b2 2501 'alt_title': video_alt_title or track,
b477fc13 2502 'thumbnails': thumbnails,
8bcc8756
JW
2503 'description': video_description,
2504 'categories': video_categories,
000b6b5a 2505 'tags': video_tags,
8bcc8756 2506 'subtitles': video_subtitles,
360e1ca5 2507 'automatic_captions': automatic_captions,
8bcc8756
JW
2508 'duration': video_duration,
2509 'age_limit': 18 if age_gate else 0,
2510 'annotations': video_annotations,
9cafc3fd 2511 'chapters': chapters,
7e8c0af0 2512 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2513 'view_count': view_count,
4ea3be0a 2514 'like_count': like_count,
2515 'dislike_count': dislike_count,
bf3c9326 2516 'average_rating': average_rating,
8bcc8756 2517 'formats': formats,
2fe1ff85 2518 'is_live': is_live,
7c80519c 2519 'start_time': start_time,
297a564b 2520 'end_time': end_time,
12afdc2a
S
2521 'series': series,
2522 'season_number': season_number,
2523 'episode_number': episode_number,
936784b2
S
2524 'track': track,
2525 'artist': artist,
5caabd3c 2526 'album': album,
2527 'release_date': release_date,
2528 'release_year': release_year,
b84071c0 2529 'subscriber_count': subscriber_count,
4ea3be0a 2530 }
c5e8d7af 2531
5f6a1245 2532
8bdd16b4 2533class YoutubeTabIE(YoutubeBaseInfoExtractor):
2534 IE_DESC = 'YouTube.com tab'
3462ffa8 2535 # (?x)^ will cause a warning in LiveIE, so this can't be split into multiple lines using '''
2536 _VALID_URL = (
2537 r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/'
2538 r'(?:(?!(%s)([/#?]|$))|'
2539 r'(?:channel|c|user)/|'
2540 r'(?:playlist|watch)\?.*?\blist=)'
2541 r'(?P<id>[^/?#&]+)') % YoutubeBaseInfoExtractor._RESERVED_NAMES
8bdd16b4 2542 IE_NAME = 'youtube:tab'
2543
81127aa5 2544 _TESTS = [{
8bdd16b4 2545 # playlists, multipage
2546 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2547 'playlist_mincount': 94,
2548 'info_dict': {
2549 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2550 'title': 'Игорь Клейнер - Playlists',
2551 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2552 },
2553 }, {
2554 # playlists, multipage, different order
2555 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2556 'playlist_mincount': 94,
2557 'info_dict': {
2558 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2559 'title': 'Игорь Клейнер - Playlists',
2560 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2561 },
2562 }, {
2563 # playlists, singlepage
2564 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2565 'playlist_mincount': 4,
2566 'info_dict': {
2567 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2568 'title': 'ThirstForScience - Playlists',
2569 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2570 }
2571 }, {
2572 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2573 'only_matching': True,
2574 }, {
2575 # basic, single video playlist
0e30a7b9 2576 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2577 'info_dict': {
0e30a7b9 2578 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2579 'uploader': 'Sergey M.',
2580 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2581 'title': 'youtube-dl public playlist',
81127aa5 2582 },
0e30a7b9 2583 'playlist_count': 1,
9291475f 2584 }, {
8bdd16b4 2585 # empty playlist
0e30a7b9 2586 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2587 'info_dict': {
0e30a7b9 2588 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2589 'uploader': 'Sergey M.',
2590 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2591 'title': 'youtube-dl empty playlist',
9291475f
PH
2592 },
2593 'playlist_count': 0,
2594 }, {
8bdd16b4 2595 # Home tab
2596 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2597 'info_dict': {
8bdd16b4 2598 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2599 'title': 'lex will - Home',
2600 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2601 },
8bdd16b4 2602 'playlist_mincount': 2,
9291475f 2603 }, {
8bdd16b4 2604 # Videos tab
2605 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2606 'info_dict': {
8bdd16b4 2607 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2608 'title': 'lex will - Videos',
2609 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2610 },
8bdd16b4 2611 'playlist_mincount': 975,
9291475f 2612 }, {
8bdd16b4 2613 # Videos tab, sorted by popular
2614 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2615 'info_dict': {
8bdd16b4 2616 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2617 'title': 'lex will - Videos',
2618 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2619 },
8bdd16b4 2620 'playlist_mincount': 199,
9291475f 2621 }, {
8bdd16b4 2622 # Playlists tab
2623 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2624 'info_dict': {
8bdd16b4 2625 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2626 'title': 'lex will - Playlists',
2627 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2628 },
8bdd16b4 2629 'playlist_mincount': 17,
ac7553d0 2630 }, {
8bdd16b4 2631 # Community tab
2632 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2633 'info_dict': {
8bdd16b4 2634 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2635 'title': 'lex will - Community',
2636 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2637 },
2638 'playlist_mincount': 18,
87dadd45 2639 }, {
8bdd16b4 2640 # Channels tab
2641 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2642 'info_dict': {
8bdd16b4 2643 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2644 'title': 'lex will - Channels',
2645 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2646 },
2647 'playlist_mincount': 138,
6b08cdf6 2648 }, {
a0566bbf 2649 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2650 'only_matching': True,
2651 }, {
a0566bbf 2652 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2653 'only_matching': True,
2654 }, {
a0566bbf 2655 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
8bdd16b4 2656 'only_matching': True,
2657 }, {
2658 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2659 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2660 'info_dict': {
2661 'title': '29C3: Not my department',
2662 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2663 'uploader': 'Christiaan008',
2664 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2665 },
2666 'playlist_count': 96,
2667 }, {
2668 'note': 'Large playlist',
2669 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2670 'info_dict': {
8bdd16b4 2671 'title': 'Uploads from Cauchemar',
2672 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2673 'uploader': 'Cauchemar',
2674 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2675 },
8bdd16b4 2676 'playlist_mincount': 1123,
2677 }, {
2678 # even larger playlist, 8832 videos
2679 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2680 'only_matching': True,
4b7df0d3
JMF
2681 }, {
2682 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2683 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2684 'info_dict': {
acf757f4
PH
2685 'title': 'Uploads from Interstellar Movie',
2686 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2687 'uploader': 'Interstellar Movie',
8bdd16b4 2688 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2689 },
481cc733 2690 'playlist_mincount': 21,
8bdd16b4 2691 }, {
2692 # https://github.com/ytdl-org/youtube-dl/issues/21844
2693 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2694 'info_dict': {
2695 'title': 'Data Analysis with Dr Mike Pound',
2696 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2697 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2698 'uploader': 'Computerphile',
2699 },
2700 'playlist_mincount': 11,
2701 }, {
a0566bbf 2702 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
8bdd16b4 2703 'only_matching': True,
dacb3a86
S
2704 }, {
2705 # Playlist URL that does not actually serve a playlist
2706 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2707 'info_dict': {
2708 'id': 'FqZTN594JQw',
2709 'ext': 'webm',
2710 'title': "Smiley's People 01 detective, Adventure Series, Action",
2711 'uploader': 'STREEM',
2712 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2713 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2714 'upload_date': '20150526',
2715 'license': 'Standard YouTube License',
2716 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2717 'categories': ['People & Blogs'],
2718 'tags': list,
dbdaaa23 2719 'view_count': int,
dacb3a86
S
2720 'like_count': int,
2721 'dislike_count': int,
2722 },
2723 'params': {
2724 'skip_download': True,
2725 },
13a75688 2726 'skip': 'This video is not available.',
dacb3a86 2727 'add_ie': [YoutubeIE.ie_key()],
481cc733 2728 }, {
8bdd16b4 2729 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2730 'only_matching': True,
66b48727 2731 }, {
8bdd16b4 2732 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2733 'only_matching': True,
a0566bbf 2734 }, {
2735 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2736 'info_dict': {
2737 'id': '9Auq9mYxFEE',
2738 'ext': 'mp4',
2739 'title': 'Watch Sky News live',
2740 'uploader': 'Sky News',
2741 'uploader_id': 'skynews',
2742 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2743 'upload_date': '20191102',
2744 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2745 'categories': ['News & Politics'],
2746 'tags': list,
2747 'like_count': int,
2748 'dislike_count': int,
2749 },
2750 'params': {
2751 'skip_download': True,
2752 },
2753 }, {
2754 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2755 'info_dict': {
2756 'id': 'a48o2S1cPoo',
2757 'ext': 'mp4',
2758 'title': 'The Young Turks - Live Main Show',
2759 'uploader': 'The Young Turks',
2760 'uploader_id': 'TheYoungTurks',
2761 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2762 'upload_date': '20150715',
2763 'license': 'Standard YouTube License',
2764 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2765 'categories': ['News & Politics'],
2766 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2767 'like_count': int,
2768 'dislike_count': int,
2769 },
2770 'params': {
2771 'skip_download': True,
2772 },
2773 'only_matching': True,
2774 }, {
2775 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2776 'only_matching': True,
2777 }, {
2778 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2779 'only_matching': True,
2780 },
2781 # TODO
2782 # {
2783 # 'url': 'https://www.youtube.com/TheYoungTurks/live',
2784 # 'only_matching': True,
2785 # }
2786 ]
8bdd16b4 2787
2788 def _extract_channel_id(self, webpage):
2789 channel_id = self._html_search_meta(
2790 'channelId', webpage, 'channel id', default=None)
2791 if channel_id:
2792 return channel_id
2793 channel_url = self._html_search_meta(
2794 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2795 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2796 'twitter:app:url:googleplay'), webpage, 'channel url')
2797 return self._search_regex(
2798 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2799 channel_url, 'channel id')
15f6397c 2800
8bdd16b4 2801 @staticmethod
2802 def _extract_grid_item_renderer(item):
2803 for item_kind in ('Playlist', 'Video', 'Channel'):
2804 renderer = item.get('grid%sRenderer' % item_kind)
2805 if renderer:
2806 return renderer
2807
def _extract_video(self, renderer):
    """Turn a video renderer dict into a url_transparent result for YoutubeIE.

    Title, description snippet, duration, view count and uploader are read
    from the renderer when present; each missing piece is left as None.
    """
    video_id = renderer.get('videoId')
    length_text = try_get(
        renderer, lambda x: x['lengthText']['simpleText'], compat_str)
    raw_views = try_get(
        renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
    # Whitespace is removed first so that only the leading digit/comma run
    # of the view count text is picked up.
    leading_number = self._search_regex(
        r'^([\d,]+)', re.sub(r'\s', '', raw_views),
        'view count', default=None)
    return {
        '_type': 'url_transparent',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
        'title': try_get(
            renderer,
            (lambda x: x['title']['runs'][0]['text'],
             lambda x: x['title']['simpleText']), compat_str),
        'description': try_get(
            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
            compat_str),
        'duration': parse_duration(length_text),
        'view_count': str_to_int(leading_number),
        'uploader': try_get(
            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str),
    }
652cdaa2 2837
8bdd16b4 2838 def _grid_entries(self, grid_renderer):
2839 for item in grid_renderer['items']:
2840 if not isinstance(item, dict):
39b62db1 2841 continue
8bdd16b4 2842 renderer = self._extract_grid_item_renderer(item)
2843 if not isinstance(renderer, dict):
2844 continue
2845 title = try_get(
2846 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2847 # playlist
2848 playlist_id = renderer.get('playlistId')
2849 if playlist_id:
2850 yield self.url_result(
2851 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2852 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2853 video_title=title)
2854 # video
2855 video_id = renderer.get('videoId')
2856 if video_id:
2857 yield self._extract_video(renderer)
2858 # channel
2859 channel_id = renderer.get('channelId')
2860 if channel_id:
2861 title = try_get(
2862 renderer, lambda x: x['title']['simpleText'], compat_str)
2863 yield self.url_result(
2864 'https://www.youtube.com/channel/%s' % channel_id,
2865 ie=YoutubeTabIE.ie_key(), video_title=title)
2866
def _shelf_entries_trimmed(self, shelf_renderer):
    """Yield the grid entries of a shelf's horizontalListRenderer, if any."""
    horizontal_list = try_get(
        shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
    if horizontal_list:
        # TODO: add support for nested playlists so each shelf is processed
        # as separate playlist
        # TODO: this includes only first N items
        for grid_entry in self._grid_entries(horizontal_list):
            yield grid_entry
2877
def _shelf_entries(self, shelf_renderer):
    """Yield a single URL result pointing at the shelf's own page.

    Nothing is yielded when no URL can be built from the shelf endpoint.
    """
    endpoint_path = try_get(
        shelf_renderer,
        lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
c5e8d7af 2888
8bdd16b4 2889 def _playlist_entries(self, video_list_renderer):
2890 for content in video_list_renderer['contents']:
2891 if not isinstance(content, dict):
2892 continue
2893 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2894 if not isinstance(renderer, dict):
2895 continue
2896 video_id = renderer.get('videoId')
2897 if not video_id:
2898 continue
2899 yield self._extract_video(renderer)
07aeced6 2900
3462ffa8 2901 def _itemSection_entries(self, item_sect_renderer):
2902 for content in item_sect_renderer['contents']:
2903 if not isinstance(content, dict):
2904 continue
2905 renderer = content.get('videoRenderer', {})
2906 if not isinstance(renderer, dict):
2907 continue
2908 video_id = renderer.get('videoId')
2909 if not video_id:
2910 continue
2911 yield self._extract_video(renderer)
2912
def _rich_entries(self, rich_grid_renderer):
    """Yield the video result wrapped by a rich item renderer, if any.

    :param rich_grid_renderer: renderer dict whose ``content`` may hold a
        ``videoRenderer``.
    """
    # try_get yields None when the path is missing or is not a dict; fall
    # back to an empty dict so the lookup below cannot hit None.get(...).
    renderer = try_get(
        rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
    video_id = renderer.get('videoId')
    if not video_id:
        return
    yield self._extract_video(renderer)
2920
8bdd16b4 2921 def _video_entry(self, video_renderer):
2922 video_id = video_renderer.get('videoId')
2923 if video_id:
2924 return self._extract_video(video_renderer)
dacb3a86 2925
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by a community post.

    The post's video attachment (if any) is yielded first, followed by every
    inline link in the post text that YoutubeIE can handle.  An inline link
    that points to the attached video is skipped so it is not yielded twice.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {}
    # Remember the attachment's ID so the inline-link loop can detect it.
    video_id = video_renderer.get('videoId')
    if video_id:
        entry = self._video_entry(video_renderer)
        if entry:
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Report the ID of the linked video, not that of the attachment.
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2954
8bdd16b4 2955 def _post_thread_continuation_entries(self, post_thread_continuation):
2956 contents = post_thread_continuation.get('contents')
2957 if not isinstance(contents, list):
2958 return
2959 for content in contents:
2960 renderer = content.get('backstagePostThreadRenderer')
2961 if not isinstance(renderer, dict):
2962 continue
2963 for entry in self._post_thread_entries(renderer):
2964 yield entry
07aeced6 2965
@staticmethod
def _extract_next_continuation_data(renderer):
    """Build continuation query data from the renderer's nextContinuationData.

    Returns None when the first ``continuations`` entry has no
    ``nextContinuationData`` dict or that dict has no continuation token.
    """
    data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
    token = data.get('continuation') if data else None
    if not token:
        return
    return {
        'ctoken': token,
        'continuation': token,
        'itct': data.get('clickTrackingParams'),
    }
c5e8d7af 2981
@classmethod
def _extract_continuation(cls, renderer):
    """Return continuation query data for the renderer, or None.

    nextContinuationData is tried first.  Otherwise the renderer's
    ``contents`` are scanned for the first continuationItemRenderer that
    provides both a token and click tracking params.
    """
    from_next = cls._extract_next_continuation_data(renderer)
    if from_next:
        return from_next
    items = renderer.get('contents')
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        endpoint = try_get(
            item, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        tracking_params = endpoint.get('clickTrackingParams') if token else None
        if token and tracking_params:
            return {
                'ctoken': token,
                'continuation': token,
                'itct': tracking_params,
            }
448830ce 3010
8bdd16b4 3011 def _entries(self, tab, identity_token):
3462ffa8 3012
3013 def extract_entries(parent_renderer):
3014 slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
3015 for slr_content in slr_contents:
3016 if not isinstance(slr_content, dict):
8bdd16b4 3017 continue
3462ffa8 3018 is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
3019 if not is_renderer:
3020 renderer = slr_content.get('richItemRenderer')
3021 if renderer:
3022 for entry in self._rich_entries(renderer):
3023 yield entry
3024 continuation_list[0] = self._extract_continuation(parent_renderer)
8bdd16b4 3025 continue
3462ffa8 3026 isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
3027 for isr_content in isr_contents:
3028 if not isinstance(isr_content, dict):
3029 continue
3030 renderer = isr_content.get('playlistVideoListRenderer')
3031 if renderer:
3032 for entry in self._playlist_entries(renderer):
3033 yield entry
3034 continuation_list[0] = self._extract_continuation(renderer)
3035 continue
3036 renderer = isr_content.get('gridRenderer')
3037 if renderer:
3038 for entry in self._grid_entries(renderer):
3039 yield entry
3040 continuation_list[0] = self._extract_continuation(renderer)
3041 continue
3042 renderer = isr_content.get('shelfRenderer')
3043 if renderer:
3044 for entry in self._shelf_entries(renderer):
3045 yield entry
3046 continuation_list[0] = self._extract_continuation(parent_renderer)
3047 continue
3048 renderer = isr_content.get('backstagePostThreadRenderer')
3049 if renderer:
3050 for entry in self._post_thread_entries(renderer):
3051 yield entry
3052 continuation_list[0] = self._extract_continuation(renderer)
3053 continue
3054 renderer = isr_content.get('videoRenderer')
3055 if renderer:
3056 entry = self._video_entry(renderer)
3057 if entry:
3058 yield entry
3059 if not continuation_list[0]:
3060 continuation_list[0] = self._extract_continuation(is_renderer)
3061 if not continuation_list[0]:
3062 continuation_list[0] = self._extract_continuation(parent_renderer)
3063
3064 continuation_list = [None] # Python 2 doesnot support nonlocal
3065 parent_renderer = (
3066 try_get(tab, lambda x: x['sectionListRenderer'], dict)
3067 or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
3068 if parent_renderer:
3069 for entry in extract_entries(parent_renderer):
3070 yield entry
8bdd16b4 3071
3462ffa8 3072 continuation = continuation_list[0]
8bdd16b4 3073
3074 headers = {
3075 'x-youtube-client-name': '1',
3076 'x-youtube-client-version': '2.20201112.04.01',
3077 }
3078 if identity_token:
3079 headers['x-youtube-identity-token'] = identity_token
ebf1b291 3080
8bdd16b4 3081 for page_num in itertools.count(1):
3082 if not continuation:
3083 break
3462ffa8 3084 if hasattr(self, '_MAX_PAGES') and page_num > self._MAX_PAGES:
3085 break
8bdd16b4 3086 browse = self._download_json(
3087 'https://www.youtube.com/browse_ajax', None,
3088 'Downloading page %d' % page_num,
3089 headers=headers, query=continuation, fatal=False)
3090 if not browse:
3091 break
3092 response = try_get(browse, lambda x: x[1]['response'], dict)
3093 if not response:
3094 break
ebf1b291 3095
8bdd16b4 3096 continuation_contents = try_get(
3097 response, lambda x: x['continuationContents'], dict)
3098 if continuation_contents:
3099 continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
3100 if continuation_renderer:
3101 for entry in self._playlist_entries(continuation_renderer):
3102 yield entry
3103 continuation = self._extract_continuation(continuation_renderer)
3104 continue
3105 continuation_renderer = continuation_contents.get('gridContinuation')
3106 if continuation_renderer:
3107 for entry in self._grid_entries(continuation_renderer):
3108 yield entry
3109 continuation = self._extract_continuation(continuation_renderer)
3110 continue
3111 continuation_renderer = continuation_contents.get('itemSectionContinuation')
3112 if continuation_renderer:
3113 for entry in self._post_thread_continuation_entries(continuation_renderer):
3114 yield entry
3115 continuation = self._extract_continuation(continuation_renderer)
3116 continue
3462ffa8 3117 continuation_renderer = continuation_contents.get('sectionListContinuation')
3118 if continuation_renderer:
3119 continuation_list = [None]
3120 for entry in extract_entries(continuation_renderer):
3121 yield entry
3122 continuation = continuation_list[0]
3123 continue
c5e8d7af 3124
8bdd16b4 3125 continuation_items = try_get(
3126 response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
3127 if continuation_items:
3128 continuation_item = continuation_items[0]
3129 if not isinstance(continuation_item, dict):
3130 continue
3131 renderer = continuation_item.get('playlistVideoRenderer')
3132 if renderer:
3133 video_list_renderer = {'contents': continuation_items}
3134 for entry in self._playlist_entries(video_list_renderer):
3135 yield entry
3136 continuation = self._extract_continuation(video_list_renderer)
3137 continue
3462ffa8 3138 renderer = continuation_item.get('itemSectionRenderer')
3139 if renderer:
3140 for entry in self._itemSection_entries(renderer):
3141 yield entry
3142 continuation = self._extract_continuation({'contents': continuation_items})
3143 continue
8bdd16b4 3144 break
9558dcec 3145
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` dict of the first tab flagged as selected.

    Raises ExtractorError if no element of ``tabs`` has a truthy boolean
    ``tabRenderer.selected`` value.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    # The loop never breaks, so reaching this point means nothing matched
    raise ExtractorError('Unable to find selected tab')
b82f815f 3153
@staticmethod
def _extract_uploader(data):
    """Collect uploader metadata from the playlist sidebar found in ``data``.

    Returns a dict that is empty when no video owner is found; otherwise it
    carries ``uploader``, ``uploader_id`` and ``uploader_url``. If several
    sidebar items expose an owner, the last one wins.
    """
    info = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
    for sidebar_item in sidebar_items or []:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner_run = try_get(
            secondary_info,
            lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0],
            dict)
        if not owner_run:
            continue
        channel_id = try_get(
            owner_run,
            lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'],
            compat_str)
        canonical_path = try_get(
            owner_run,
            lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'],
            compat_str)
        info['uploader'] = owner_run.get('text')
        info['uploader_id'] = channel_id
        info['uploader_url'] = urljoin(
            'https://www.youtube.com/', canonical_path)
    return info
3176
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build a playlist result from the selected tab of a tabbed page.

    Title, description and playlist id come from the channel metadata
    renderer and are then overridden by the playlist metadata renderer when
    that one is present. Returns None if neither yields a playlist id.
    """
    selected_tab = self._extract_selected_tab(tabs)
    playlist_id = None

    channel_meta = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if channel_meta:
        # Channel pages are titled "<channel title> - <tab title>"
        title = channel_meta.get('title') or item_id
        tab_title = selected_tab.get('title')
        if tab_title:
            title += ' - %s' % tab_title
        description = channel_meta.get('description')
        playlist_id = channel_meta.get('externalId')

    playlist_meta = try_get(
        data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if playlist_meta:
        title = playlist_meta.get('title')
        description = None
        playlist_id = item_id

    if playlist_id is None:
        return None

    entries = self._entries(selected_tab['content'], identity_token)
    result = self.playlist_result(
        entries, playlist_id=playlist_id, playlist_title=title,
        playlist_description=description)
    result.update(self._extract_uploader(data))
    return result
73c4ac2c 3204
def _extract_from_playlist(self, item_id, data, playlist):
    """Build a playlist result from an already located ``playlist`` dict.

    The title falls back to ``data['titleText']['simpleText']`` and the id
    falls back to ``item_id`` when ``playlist`` does not provide them.
    """
    title = playlist.get('title')
    if not title:
        title = try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
    resolved_id = playlist.get('playlistId') or item_id
    entries = self._playlist_entries(playlist)
    return self.playlist_result(
        entries, playlist_id=resolved_id, playlist_title=title)
c5e8d7af 3212
def _real_extract(self, url):
    """Extract a tabbed page or playlist, falling back to a single video.

    The URL host is normalised to ``www.youtube.com`` before download. When
    the URL carries both ``v`` and ``list`` and the ``noplaylist`` option is
    set, only the video is returned. Otherwise the page's initial data is
    inspected for tabs, then for a watch-page playlist, then for a current
    video id.

    Raises ExtractorError if none of those is recognised.
    """
    item_id = self._match_id(url)
    url = compat_urlparse.urlunparse(
        compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
    # Handle both video/playlist URLs
    qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = qs.get('v', [None])[0]
    playlist_id = qs.get('list', [None])[0]
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
    webpage = self._download_webpage(url, item_id)
    # Optional whitespace may surround the colon; the previous pattern
    # (`:\s/l*`) demanded a literal "/" there and therefore never matched,
    # leaving the identity token unset for every page.
    identity_token = self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
    data = self._extract_yt_initial_data(item_id, webpage)
    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, data, playlist)
    # Fallback to video extraction if no playlist alike page is recognized.
    # First check for the current video then try the v attribute of URL query.
    video_id = try_get(
        data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
        compat_str) or video_id
    if video_id:
        return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
    # Failed to recognize
    raise ExtractorError('Unable to recognize tab page')
c5e8d7af 3248
c5e8d7af 3249
class YoutubePlaylistIE(InfoExtractor):
    """Match bare playlist ids and ``list=`` URLs, then hand off to YoutubeTabIE."""

    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that YoutubeTabIE already accepts."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Redirect to the ``/playlist`` page, keeping the original query.

        A bare id has no query string, so ``list=<id>`` is synthesised.
        """
        playlist_id = self._match_id(url)
        query = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query) or {'list': playlist_id}
        target = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3348
3349
class YoutubeYtUserIE(InfoExtractor):
    """Expand the ``ytuser:<name>`` shorthand into a ``/user/<name>`` URL."""

    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate the matched user name to YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3362
b05654f0 3363
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backed by the ``youtubei/v1/search`` JSON endpoint."""

    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to ``n`` url_transparent video entries for ``query``.

        Pages are requested until a download fails, the expected renderers
        are missing, no continuation token is returned, or ``n`` entries
        have been produced.
        """
        payload = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            payload['params'] = self._SEARCH_PARAMS

        yielded = 0
        for page_num in itertools.count(1):
            response = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(payload).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not response:
                break

            # The first lookup path is tried first, the continuation-items
            # path second
            sections = try_get(
                response,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not sections:
                break

            items = try_get(
                sections,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not items:
                break

            for item in items:
                if not isinstance(item, dict):
                    continue
                renderer = item.get('videoRenderer')
                if not isinstance(renderer, dict):
                    continue
                video_id = renderer.get('videoId')
                if not video_id:
                    continue

                view_count_text = try_get(
                    renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Strip all whitespace, then keep the leading run of digits
                leading_digits = self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None)

                yielded += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': try_get(
                        renderer, lambda x: x['title']['runs'][0]['text'], compat_str),
                    'description': try_get(
                        renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str),
                    'duration': parse_duration(try_get(
                        renderer, lambda x: x['lengthText']['simpleText'], compat_str)),
                    'view_count': int_or_none(leading_digits),
                    'uploader': try_get(
                        renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str),
                }
                if yielded == n:
                    return

            next_token = try_get(
                sections,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not next_token:
                break
            payload['continuation'] = next_token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = self._entries(query, n)
        return self.playlist_result(entries, query)
75dff0ee 3452
c9ae7b95 3453
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YoutubeSearchIE variant that sends a fixed ``params`` value."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Forwarded verbatim as the request's 'params' field by _entries
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3459
c9ae7b95 3460
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a ``/results`` search URL into a YoutubeSearchIE query."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _PARAM_REGEX = r''
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results/?(?:\?|\?[^#]*?&)(?:sp=(?P<param1>[^&#]+)&(?:[^#]*&)?)?(?:q|search_query)=(?P<query>[^#&]+)(?:[^#]*?&sp=(?P<param2>[^#&]+))?'
    _MAX_RESULTS = 100
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Run the URL's query through a YoutubeSearchIE instance.

        The ``sp`` value may appear before or after the query parameter, so
        whichever of the two capture groups matched is used.
        """
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(match.group('query'))
        searcher = YoutubeSearchIE(self._downloader)
        searcher._SEARCH_PARAMS = match.group('param1') or match.group('param2')
        # NOTE(review): this looks like leftover debug output; the value is
        # None when the URL has no `sp` parameter - confirm it is wanted.
        self._downloader.to_screen(searcher._SEARCH_PARAMS)
        searcher._MAX_RESULTS = self._MAX_RESULTS
        return searcher._get_n_results(query, self._MAX_RESULTS)
3486
3487
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _TESTS = []

    # _MAX_PAGES = 5

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _shelf_entries(self, shelf_renderer):
        """Yield the grid entries nested under ``content.gridRenderer``."""
        grid = try_get(
            shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
        if grid:
            for entry in self._grid_entries(grid):
                yield entry

    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Return the selected tab's entries titled with ``_PLAYLIST_TITLE``."""
        selected_tab = self._extract_selected_tab(tabs)
        entries = self._entries(selected_tab['content'], identity_token)
        return self.playlist_result(
            entries, playlist_title=self._PLAYLIST_TITLE)

    def _real_extract(self, url):
        """Download ``/feed/<_FEED_NAME>`` and extract its tabs.

        The incoming ``url`` is ignored and replaced by the feed URL.
        Raises ExtractorError when the page exposes no tabs.
        """
        item_id = self._FEED_NAME
        url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        webpage = self._download_webpage(url, item_id)
        identity_token = self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if not tabs:
            # Failed to recognize
            raise ExtractorError('Unable to recognize feed page')
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
25f14e9f
S
3532
3533
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Resolve the watch-later feed to the ``WL`` playlist."""

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" or "WL" for short (requires authentication)'
    _FEED_NAME = 'watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the fixed id ``WL`` to YoutubePlaylistIE."""
        playlist_key = YoutubePlaylistIE.ie_key()
        return self.url_result('WL', ie=playlist_key)
3549
3550
class YoutubeFavouritesIE(YoutubeFeedsInfoExtractor):
    """Resolve the ``:ytfav`` shorthand to the ``LL`` playlist."""

    IE_DESC = 'YouTube.com liked videos, ":ytfav" or "LL" for short (requires authentication)'
    _FEED_NAME = 'favourites'
    _VALID_URL = r':ytfav(?:ou?rite)s?'

    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the fixed id ``LL`` to YoutubePlaylistIE."""
        playlist_key = YoutubePlaylistIE.ie_key()
        return self.url_result('LL', ie=playlist_key)
f459d170 3563
5f6a1245 3564
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/recommended``, the site root and ``:ytrec``."""

    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    # Besides the feed URL, the bare site root (optionally followed by ? or #) matches
    _VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
1ed5b5c9 3570
1ed5b5c9 3571
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/subscriptions`` and the ``:ytsubs`` shorthand."""

    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
1ed5b5c9 3577
1ed5b5c9 3578
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/history`` and the ``:ythistory`` shorthand."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3584
3585
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs that lost their ``v=`` part and explain why."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        advice = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(advice, expected=True)
772fd5cc
PH
3633
3634
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is only 1-10 characters long."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)
8bdd16b4 3650
3651
# Do YouTube show URLs even exist anymore? I couldn't find any.
# The extractor below is disabled by wrapping it in a bare raw-string
# literal, so nothing in it is defined at import time.
r'''
class YoutubeShowIE(YoutubeTabIE):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % playlist_id)
'''