]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Revert changes to vlive
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
8d81f3e3 19 compat_kwargs,
c5e8d7af 20 compat_parse_qs,
7fd002c0
S
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
15707c7e 23 compat_urllib_parse_urlencode,
7c80519c 24 compat_urllib_parse_urlparse,
7c61bd36 25 compat_urlparse,
c5e8d7af 26 compat_str,
4bb4a188
PH
27)
28from ..utils import (
27019dbb 29 bool_or_none,
c5e8d7af 30 clean_html,
9b9c5355 31 error_to_compat_str,
c5e8d7af 32 ExtractorError,
2d30521a 33 float_or_none,
4bb4a188 34 get_element_by_id,
dd27fd17 35 int_or_none,
94278f72 36 mimetype2ext,
4bb4a188 37 orderedSet,
6310acf5 38 parse_codecs,
b84071c0 39 parse_count,
7c80519c 40 parse_duration,
0cb58b02 41 remove_quotes,
3995d37d 42 remove_start,
cf7e015f 43 smuggle_url,
dbdaaa23 44 str_or_none,
c93d53f5 45 str_to_int,
556dbe7f 46 try_get,
c5e8d7af
PH
47 unescapeHTML,
48 unified_strdate,
cf7e015f 49 unsmuggle_url,
8bdd16b4 50 update_url_query,
81c2f20b 51 uppercase_escape,
21c340b8 52 url_or_none,
6e6bc8da 53 urlencode_postdata,
8bdd16b4 54 urljoin,
c5e8d7af
PH
55)
56
5f6a1245 57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account endpoints used by the login flow in _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the TL token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Regex fragment matching a playlist ID; interpolated into URL patterns.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the 'PREF' cookie on .youtube.com (includes 'hl=en')."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video ID in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to match the documented contract.
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus the JSON-encoded f_req to url.

            Returns the parsed JSON response; the download is non-fatal
            (fatal=False), and callers check the result against False.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything preceding the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            """Report message as a warning through the downloader."""
            self._downloader.report_warning(message)

        # Same literal is embedded in both the lookup and the challenge request.
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not a bare return) to match the documented contract.
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if/else', so without the parentheses the prefix would be lost
            # for every message other than INCORRECT_ANSWER_ENTERED.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes may be entered with a leading 'G-'; submit without it
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above.
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # The login counts as successful only if this URL occurs in the page
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of 'query'.

        The 'query' keyword (default: empty dict) is copied so the dict
        passed on is never the caller's own object.
        """
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Return the parsed ytInitialData object found in webpage.

        Returns None when no ytInitialData assignment is found; JSON parsing
        is non-fatal (fatal=False).
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Base JSON payload for requests made by _call_api().
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    def _call_api(self, ep, query, video_id):
        """POST query (merged over _DEFAULT_API_DATA) to the youtubei/v1 endpoint ep.

        The copy is shallow, but update() only rebinds top-level keys, so the
        class-level default is not modified. Returns the parsed JSON response.
        """
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Return the parsed ytInitialData object found in webpage.

        Unlike _get_yt_initial_data, neither the regex search nor the JSON
        parsing is given a default or fatal=False here.
        """
        return self._parse_json(
            self._search_regex(
                r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
                webpage, 'yt initial data'),
            video_id)
0c148415
S
324
325
360e1ca5 326class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 327 IE_DESC = 'YouTube.com'
cb7dfeea 328 _VALID_URL = r"""(?x)^
c5e8d7af 329 (
edb53e2d 330 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 331 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 332 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 333 (?:www\.)?pwnyoutube\.com/|
8b561bfc 334 (?:www\.)?hooktube\.com/|
f7000f3a 335 (?:www\.)?yourepeat\.com/|
e69ae5b9 336 tube\.majestyc\.net/|
ba036333 337 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 338 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 339 (?:(?:www|no)\.)?invidiou\.sh/|
340 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 341 (?:www\.)?invidious\.kabi\.tk/|
ba036333 342 (?:www\.)?invidious\.13ad\.de/|
791d2e81 343 (?:www\.)?invidious\.mastodon\.host/|
494d664e 344 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 345 (?:www\.)?invidious\.drycat\.fr/|
ba036333 346 (?:www\.)?tube\.poal\.co/|
8ae113ca 347 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 348 (?:www\.)?yewtu\.be/|
494d664e 349 (?:www\.)?yt\.elukerio\.org/|
894b3826 350 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 351 (?:www\.)?invidious\.ggc-project\.de/|
352 (?:www\.)?yt\.maisputain\.ovh/|
353 (?:www\.)?invidious\.13ad\.de/|
354 (?:www\.)?invidious\.toot\.koeln/|
355 (?:www\.)?invidious\.fdn\.fr/|
356 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 357 (?:www\.)?kgg2m7yk5aybusll\.onion/|
358 (?:www\.)?qklhadlycap4cnod\.onion/|
359 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
360 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
361 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
362 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 363 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 364 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 365 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
366 (?:.*?\#/)? # handle anchor (#/) redirect urls
367 (?: # the various things that can precede the ID:
ac7553d0 368 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 369 |(?: # or the v= param in all its forms
f7000f3a 370 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 371 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 372 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
373 v=
374 )
f4b05232 375 ))
cbaed4bb
S
376 |(?:
377 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
378 vid\.plus| # or vid.plus/xxxx
379 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 380 )/
edb53e2d 381 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 382 )
c5e8d7af 383 )? # all until now is optional -> you can pass the naked ID
8bdd16b4 384 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
385 (?!.*?\blist=
386 (?:
387 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
388 WL # WL are handled by the watch later IE
389 )
390 )
c5e8d7af 391 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 392 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 393 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
394 _PLAYER_INFO_RE = (
395 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
396 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
397 )
2c62dc26 398 _formats = {
c2d3cb4c 399 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
400 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
401 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
402 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
403 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
404 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
405 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
406 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 407 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 408 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
409 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
410 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
411 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
412 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
413 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 414 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 415 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
416 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 417
418
419 # 3D videos
c2d3cb4c 420 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
421 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
422 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
423 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 424 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
425 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
426 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 427
96fb5605 428 # Apple HTTP Live Streaming
11f12195 429 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 430 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
431 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
432 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
433 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
434 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 435 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
436 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
437
438 # DASH mp4 video
d23028a8
S
439 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
440 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
441 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
442 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
443 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 444 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
445 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
446 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
447 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
448 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
449 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
450 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 451
f6f1fc92 452 # Dash mp4 audio
d23028a8
S
453 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
454 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
455 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
456 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
457 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
458 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
459 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
460
461 # Dash webm
d23028a8
S
462 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
463 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
464 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
465 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
466 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
467 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
468 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
469 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
470 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
471 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
472 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
473 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
474 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
475 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
476 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 477 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
478 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
479 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
480 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
481 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
482 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
483 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
484
485 # Dash webm audio
d23028a8
S
486 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
487 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 488
0857baad 489 # Dash webm audio with opus inside
d23028a8
S
490 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
491 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
492 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 493
ce6b9a2d
PH
494 # RTMP (unnamed)
495 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
496
497 # av01 video only formats sometimes served with "unknown" codecs
498 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
499 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
500 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
501 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 502 }
84da5d84 503 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 504
fd5c4aab
S
505 _GEO_BYPASS = False
506
78caa52a 507 IE_NAME = 'youtube'
2eb88d95
PH
508 _TESTS = [
509 {
2d3d2997 510 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
511 'info_dict': {
512 'id': 'BaW_jenozKc',
513 'ext': 'mp4',
3867038a 514 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
515 'uploader': 'Philipp Hagemeister',
516 'uploader_id': 'phihag',
ec85ded8 517 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
518 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
519 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 520 'upload_date': '20121002',
3867038a 521 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 522 'categories': ['Science & Technology'],
3867038a 523 'tags': ['youtube-dl'],
556dbe7f 524 'duration': 10,
dbdaaa23 525 'view_count': int,
3e7c1224
PH
526 'like_count': int,
527 'dislike_count': int,
7c80519c 528 'start_time': 1,
297a564b 529 'end_time': 9,
2eb88d95 530 }
0e853ca4 531 },
fccd3771 532 {
4bc3a23e
PH
533 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
534 'note': 'Embed-only video (#1746)',
535 'info_dict': {
536 'id': 'yZIXLfi8CZQ',
537 'ext': 'mp4',
538 'upload_date': '20120608',
539 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
540 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
541 'uploader': 'SET India',
94bfcd23 542 'uploader_id': 'setindia',
ec85ded8 543 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 544 'age_limit': 18,
fccd3771
PH
545 }
546 },
11b56058 547 {
8bdd16b4 548 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
11b56058
PM
549 'note': 'Use the first video ID in the URL',
550 'info_dict': {
551 'id': 'BaW_jenozKc',
552 'ext': 'mp4',
3867038a 553 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
554 'uploader': 'Philipp Hagemeister',
555 'uploader_id': 'phihag',
ec85ded8 556 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 557 'upload_date': '20121002',
3867038a 558 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 559 'categories': ['Science & Technology'],
3867038a 560 'tags': ['youtube-dl'],
556dbe7f 561 'duration': 10,
dbdaaa23 562 'view_count': int,
11b56058
PM
563 'like_count': int,
564 'dislike_count': int,
34a7de29
S
565 },
566 'params': {
567 'skip_download': True,
568 },
11b56058 569 },
dd27fd17 570 {
2d3d2997 571 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
572 'note': '256k DASH audio (format 141) via DASH manifest',
573 'info_dict': {
574 'id': 'a9LDPn-MO4I',
575 'ext': 'm4a',
576 'upload_date': '20121002',
577 'uploader_id': '8KVIDEO',
ec85ded8 578 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
579 'description': '',
580 'uploader': '8KVIDEO',
581 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 582 },
4bc3a23e
PH
583 'params': {
584 'youtube_include_dash_manifest': True,
585 'format': '141',
4919603f 586 },
de3c7fe0 587 'skip': 'format 141 not served anymore',
dd27fd17 588 },
8bdd16b4 589 # DASH manifest with encrypted signature
590 {
591 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
592 'info_dict': {
593 'id': 'IB3lcPjvWLA',
594 'ext': 'm4a',
595 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
596 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
597 'duration': 244,
598 'uploader': 'AfrojackVEVO',
599 'uploader_id': 'AfrojackVEVO',
600 'upload_date': '20131011',
601 },
602 'params': {
603 'youtube_include_dash_manifest': True,
604 'format': '141/bestaudio[ext=m4a]',
605 },
606 },
aa79ac0c
PH
607 # Controversy video
608 {
609 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
610 'info_dict': {
611 'id': 'T4XJQO3qol8',
612 'ext': 'mp4',
556dbe7f 613 'duration': 219,
aa79ac0c 614 'upload_date': '20100909',
4fe54c12 615 'uploader': 'Amazing Atheist',
aa79ac0c 616 'uploader_id': 'TheAmazingAtheist',
ec85ded8 617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
618 'title': 'Burning Everyone\'s Koran',
619 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
620 }
c522adb1 621 },
dd2d55f1 622 # Normal age-gate video (embed allowed)
c522adb1 623 {
2d3d2997 624 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
625 'info_dict': {
626 'id': 'HtVdAasjOgU',
627 'ext': 'mp4',
628 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 629 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 630 'duration': 142,
c522adb1
JMF
631 'uploader': 'The Witcher',
632 'uploader_id': 'WitcherGame',
ec85ded8 633 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 634 'upload_date': '20140605',
34952f09 635 'age_limit': 18,
c522adb1
JMF
636 },
637 },
8bdd16b4 638 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
639 # YouTube Red ad is not captured for creator
640 {
641 'url': '__2ABJjxzNo',
642 'info_dict': {
643 'id': '__2ABJjxzNo',
644 'ext': 'mp4',
645 'duration': 266,
646 'upload_date': '20100430',
647 'uploader_id': 'deadmau5',
648 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
649 'creator': 'Dada Life, deadmau5',
650 'description': 'md5:12c56784b8032162bb936a5f76d55360',
651 'uploader': 'deadmau5',
652 'title': 'Deadmau5 - Some Chords (HD)',
653 'alt_title': 'This Machine Kills Some Chords',
654 },
655 'expected_warnings': [
656 'DASH manifest missing',
657 ]
658 },
067aa17e 659 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
660 {
661 'url': 'lqQg6PlCWgI',
662 'info_dict': {
663 'id': 'lqQg6PlCWgI',
664 'ext': 'mp4',
556dbe7f 665 'duration': 6085,
90227264 666 'upload_date': '20150827',
cbe2bd91 667 'uploader_id': 'olympic',
ec85ded8 668 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 669 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 670 'uploader': 'Olympic',
cbe2bd91
PH
671 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
672 },
673 'params': {
674 'skip_download': 'requires avconv',
e52a40ab 675 }
cbe2bd91 676 },
6271f1ca
PH
677 # Non-square pixels
678 {
679 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
680 'info_dict': {
681 'id': '_b-2C3KPAM0',
682 'ext': 'mp4',
683 'stretched_ratio': 16 / 9.,
556dbe7f 684 'duration': 85,
6271f1ca
PH
685 'upload_date': '20110310',
686 'uploader_id': 'AllenMeow',
ec85ded8 687 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 688 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 689 'uploader': '孫ᄋᄅ',
6271f1ca
PH
690 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
691 },
06b491eb
S
692 },
693 # url_encoded_fmt_stream_map is empty string
694 {
695 'url': 'qEJwOuvDf7I',
696 'info_dict': {
697 'id': 'qEJwOuvDf7I',
f57b7835 698 'ext': 'webm',
06b491eb
S
699 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
700 'description': '',
701 'upload_date': '20150404',
702 'uploader_id': 'spbelect',
703 'uploader': 'Наблюдатели Петербурга',
704 },
705 'params': {
706 'skip_download': 'requires avconv',
e323cf3f
S
707 },
708 'skip': 'This live event has ended.',
06b491eb 709 },
067aa17e 710 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
711 {
712 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
713 'info_dict': {
714 'id': 'FIl7x6_3R5Y',
eb6793ba 715 'ext': 'webm',
da77d856
S
716 'title': 'md5:7b81415841e02ecd4313668cde88737a',
717 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 718 'duration': 220,
da77d856
S
719 'upload_date': '20150625',
720 'uploader_id': 'dorappi2000',
ec85ded8 721 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 722 'uploader': 'dorappi2000',
eb6793ba 723 'formats': 'mincount:31',
da77d856 724 },
eb6793ba 725 'skip': 'not actual anymore',
2ee8f5d8 726 },
8a1a26ce
YCH
727 # DASH manifest with segment_list
728 {
729 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
730 'md5': '8ce563a1d667b599d21064e982ab9e31',
731 'info_dict': {
732 'id': 'CsmdDsKjzN8',
733 'ext': 'mp4',
17ee98e1 734 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
735 'uploader': 'Airtek',
736 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
737 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
738 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
739 },
740 'params': {
741 'youtube_include_dash_manifest': True,
742 'format': '135', # bestvideo
be49068d
S
743 },
744 'skip': 'This live event has ended.',
2ee8f5d8 745 },
cf7e015f
S
746 {
747 # Multifeed videos (multiple cameras), URL is for Main Camera
748 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
749 'info_dict': {
750 'id': 'jqWvoWXjCVs',
751 'title': 'teamPGP: Rocket League Noob Stream',
752 'description': 'md5:dc7872fb300e143831327f1bae3af010',
753 },
754 'playlist': [{
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'ext': 'mp4',
758 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
759 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 760 'duration': 7335,
cf7e015f
S
761 'upload_date': '20150721',
762 'uploader': 'Beer Games Beer',
763 'uploader_id': 'beergamesbeer',
ec85ded8 764 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 765 'license': 'Standard YouTube License',
cf7e015f
S
766 },
767 }, {
768 'info_dict': {
769 'id': '6h8e8xoXJzg',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 773 'duration': 7337,
cf7e015f
S
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
ec85ded8 777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 778 'license': 'Standard YouTube License',
cf7e015f
S
779 },
780 }, {
781 'info_dict': {
782 'id': 'PUOgX5z9xZw',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 786 'duration': 7337,
cf7e015f
S
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
ec85ded8 790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 791 'license': 'Standard YouTube License',
cf7e015f
S
792 },
793 }, {
794 'info_dict': {
795 'id': 'teuwxikvS5k',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (zim)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 799 'duration': 7334,
cf7e015f
S
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
ec85ded8 803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 804 'license': 'Standard YouTube License',
cf7e015f
S
805 },
806 }],
807 'params': {
808 'skip_download': True,
809 },
4fe54c12 810 'skip': 'This video is not available.',
cbaed4bb 811 },
f9f49d87 812 {
067aa17e 813 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
814 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
815 'info_dict': {
816 'id': 'gVfLd0zydlo',
817 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
818 },
819 'playlist_count': 2,
be49068d 820 'skip': 'Not multifeed anymore',
f9f49d87 821 },
cbaed4bb 822 {
2d3d2997 823 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 824 'only_matching': True,
0e49d9a6 825 },
6d4fc66b 826 {
2d3d2997 827 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
828 'only_matching': True,
829 },
0e49d9a6 830 {
067aa17e 831 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 832 # Also tests cut-off URL expansion in video description (see
067aa17e
S
833 # https://github.com/ytdl-org/youtube-dl/issues/1892,
834 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
835 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
836 'info_dict': {
837 'id': 'lsguqyKfVQg',
838 'ext': 'mp4',
839 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 840 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 841 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 842 'duration': 133,
0e49d9a6
LL
843 'upload_date': '20151119',
844 'uploader_id': 'IronSoulElf',
ec85ded8 845 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 846 'uploader': 'IronSoulElf',
eb6793ba
S
847 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
848 'track': 'Dark Walk - Position Music',
849 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 850 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
851 },
852 'params': {
853 'skip_download': True,
854 },
855 },
61f92af1 856 {
067aa17e 857 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
858 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
859 'only_matching': True,
860 },
313dfc45
LL
861 {
862 # Video with yt:stretch=17:0
863 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
864 'info_dict': {
865 'id': 'Q39EVAstoRM',
866 'ext': 'mp4',
867 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
868 'description': 'md5:ee18a25c350637c8faff806845bddee9',
869 'upload_date': '20151107',
870 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
871 'uploader': 'CH GAMER DROID',
872 },
873 'params': {
874 'skip_download': True,
875 },
be49068d 876 'skip': 'This video does not exist.',
313dfc45 877 },
7caf9830
S
878 {
879 # Video licensed under Creative Commons
880 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
881 'info_dict': {
882 'id': 'M4gD1WSo5mA',
883 'ext': 'mp4',
884 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
885 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 886 'duration': 721,
7caf9830
S
887 'upload_date': '20150127',
888 'uploader_id': 'BerkmanCenter',
ec85ded8 889 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 890 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
891 'license': 'Creative Commons Attribution license (reuse allowed)',
892 },
893 'params': {
894 'skip_download': True,
895 },
896 },
fd050249
S
897 {
898 # Channel-like uploader_url
899 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
900 'info_dict': {
901 'id': 'eQcmzGIKrzg',
902 'ext': 'mp4',
903 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
904 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 905 'duration': 4060,
fd050249 906 'upload_date': '20151119',
eb6793ba 907 'uploader': 'Bernie Sanders',
fd050249 908 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
910 'license': 'Creative Commons Attribution license (reuse allowed)',
911 },
912 'params': {
913 'skip_download': True,
914 },
915 },
040ac686
S
916 {
917 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
918 'only_matching': True,
7f29cf54
S
919 },
920 {
067aa17e 921 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
922 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
923 'only_matching': True,
6496ccb4
S
924 },
925 {
926 # Rental video preview
927 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
928 'info_dict': {
929 'id': 'uGpuVWrhIzE',
930 'ext': 'mp4',
931 'title': 'Piku - Trailer',
932 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
933 'upload_date': '20150811',
934 'uploader': 'FlixMatrix',
935 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 936 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
937 'license': 'Standard YouTube License',
938 },
939 'params': {
940 'skip_download': True,
941 },
eb6793ba 942 'skip': 'This video is not available.',
022a5d66 943 },
12afdc2a
S
944 {
945 # YouTube Red video with episode data
946 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
947 'info_dict': {
948 'id': 'iqKdEhx-dD4',
949 'ext': 'mp4',
950 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 951 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 952 'duration': 2085,
12afdc2a
S
953 'upload_date': '20170118',
954 'uploader': 'Vsauce',
955 'uploader_id': 'Vsauce',
956 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
957 'series': 'Mind Field',
958 'season_number': 1,
959 'episode_number': 1,
960 },
961 'params': {
962 'skip_download': True,
963 },
964 'expected_warnings': [
965 'Skipping DASH manifest',
966 ],
967 },
c7121fa7
S
968 {
969 # The following content has been identified by the YouTube community
970 # as inappropriate or offensive to some audiences.
971 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
972 'info_dict': {
973 'id': '6SJNVb0GnPI',
974 'ext': 'mp4',
975 'title': 'Race Differences in Intelligence',
976 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
977 'duration': 965,
978 'upload_date': '20140124',
979 'uploader': 'New Century Foundation',
980 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
981 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
982 },
983 'params': {
984 'skip_download': True,
985 },
986 },
022a5d66
S
987 {
988 # itag 212
989 'url': '1t24XAntNCY',
990 'only_matching': True,
fd5c4aab
S
991 },
992 {
993 # geo restricted to JP
994 'url': 'sJL6WA-aGkQ',
995 'only_matching': True,
996 },
cd5a74a2
S
997 {
998 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
999 'only_matching': True,
1000 },
825cd268
RA
1001 {
1002 # DRM protected
1003 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1004 'only_matching': True,
4fe54c12
S
1005 },
1006 {
1007 # Video with unsupported adaptive stream type formats
1008 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1009 'info_dict': {
1010 'id': 'Z4Vy8R84T1U',
1011 'ext': 'mp4',
1012 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1013 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1014 'duration': 433,
1015 'upload_date': '20130923',
1016 'uploader': 'Amelia Putri Harwita',
1017 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1018 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1019 'formats': 'maxcount:10',
1020 },
1021 'params': {
1022 'skip_download': True,
1023 'youtube_include_dash_manifest': False,
1024 },
5429d6a9 1025 'skip': 'not actual anymore',
5caabd3c 1026 },
1027 {
822b9d9c 1028 # Youtube Music Auto-generated description
5caabd3c 1029 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1030 'info_dict': {
1031 'id': 'MgNrAu2pzNs',
1032 'ext': 'mp4',
1033 'title': 'Voyeur Girl',
1034 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1035 'upload_date': '20190312',
5429d6a9
S
1036 'uploader': 'Stephen - Topic',
1037 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1038 'artist': 'Stephen',
1039 'track': 'Voyeur Girl',
1040 'album': 'it\'s too much love to know my dear',
1041 'release_date': '20190313',
1042 'release_year': 2019,
1043 },
1044 'params': {
1045 'skip_download': True,
1046 },
1047 },
66b48727
RA
1048 {
1049 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1050 'only_matching': True,
1051 },
011e75e6
S
1052 {
1053 # invalid -> valid video id redirection
1054 'url': 'DJztXj2GPfl',
1055 'info_dict': {
1056 'id': 'DJztXj2GPfk',
1057 'ext': 'mp4',
1058 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1059 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1060 'upload_date': '20090125',
1061 'uploader': 'Prochorowka',
1062 'uploader_id': 'Prochorowka',
1063 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1064 'artist': 'Panjabi MC',
1065 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1066 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1067 },
1068 'params': {
1069 'skip_download': True,
1070 },
ea74e00b
DP
1071 },
1072 {
1073 # empty description results in an empty string
1074 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1075 'info_dict': {
1076 'id': 'x41yOUIvK2k',
1077 'ext': 'mp4',
1078 'title': 'IMG 3456',
1079 'description': '',
1080 'upload_date': '20170613',
1081 'uploader_id': 'ElevageOrVert',
1082 'uploader': 'ElevageOrVert',
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
2eb88d95
PH
1088 ]
1089
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and create its empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Starts empty; entries are added while signatures are being decrypted
    self._player_cache = dict()
e0df6211 1093
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1097
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1101
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1105
def report_rtmp_download(self):
    """Tell the user that the download will go over RTMP."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1109
60064c53
PH
1110 def _signature_cache_id(self, example_sig):
1111 """ Return a string representation of a signature """
78caa52a 1112 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53 1113
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the (ext, id) groups of the first player pattern that matches.

    The patterns in cls._PLAYER_INFO_RE are searched in order against
    player_url. Raises ExtractorError when none of them matches.
    """
    # Lazy generator: later patterns are not tried once one has matched
    attempts = (
        re.search(player_re, player_url) for player_re in cls._PLAYER_INFO_RE)
    id_m = next((found for found in attempts if found), None)
    if id_m is None:
        raise ExtractorError('Cannot identify player %r' % player_url)
    return id_m.group('ext'), id_m.group('id')
1123
1124 def _extract_signature_function(self, video_id, player_url, example_sig):
1125 player_type, player_id = self._extract_player_info(player_url)
e0df6211 1126
c4417ddb 1127 # Read from filesystem cache
60064c53
PH
1128 func_id = '%s_%s_%s' % (
1129 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 1130 assert os.path.basename(func_id) == func_id
a0e07d31 1131
69ea8ca4 1132 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 1133 if cache_spec is not None:
78caa52a 1134 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 1135
6d1a55a5
PH
1136 download_note = (
1137 'Downloading player %s' % player_url
1138 if self._downloader.params.get('verbose') else
1139 'Downloading %s player %s' % (player_type, player_id)
1140 )
e0df6211
PH
1141 if player_type == 'js':
1142 code = self._download_webpage(
1143 player_url, video_id,
6d1a55a5 1144 note=download_note,
69ea8ca4 1145 errnote='Download of %s failed' % player_url)
83799698 1146 res = self._parse_sig_js(code)
c4417ddb 1147 elif player_type == 'swf':
e0df6211
PH
1148 urlh = self._request_webpage(
1149 player_url, video_id,
6d1a55a5 1150 note=download_note,
69ea8ca4 1151 errnote='Download of %s failed' % player_url)
e0df6211 1152 code = urlh.read()
83799698 1153 res = self._parse_sig_swf(code)
e0df6211
PH
1154 else:
1155 assert False, 'Invalid player type %r' % player_type
1156
785521bf
PH
1157 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1158 cache_res = res(test_string)
1159 cache_spec = [ord(c) for c in cache_res]
83799698 1160
69ea8ca4 1161 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
1162 return res
1163
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is run on a probe string of the same length as example_sig; the
    resulting permutation is rendered as a concatenation of index and slice
    expressions on a variable named s, then shown with self.to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to index; avoids touching the unset loop variable below
            return

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Seed with the first index so that a one-element spec yields s[N]
        # instead of failing on an unbound name after the (empty) loop
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Code points of the probe output reveal the source index of each char
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1202
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature routine is found with the regexes below
    (tried in order), the routine is extracted with JSInterpreter, and a
    one-argument wrapper around it is returned.
    """
    sig_name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        sig_name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    initial_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
1223
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter and wrapped in a one-argument callable.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
1230
83799698 1231 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1232 """Turn the encrypted s field into a working signature"""
6b37f0be 1233
c8bf86d5 1234 if player_url is None:
69ea8ca4 1235 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 1236
69ea8ca4 1237 if player_url.startswith('//'):
78caa52a 1238 player_url = 'https:' + player_url
3c90cc8b
S
1239 elif not re.match(r'https?://', player_url):
1240 player_url = compat_urlparse.urljoin(
1241 'https://www.youtube.com', player_url)
c8bf86d5 1242 try:
62af3a0e 1243 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
1244 if player_id not in self._player_cache:
1245 func = self._extract_signature_function(
60064c53 1246 video_id, player_url, s
c8bf86d5
PH
1247 )
1248 self._player_cache[player_id] = func
1249 func = self._player_cache[player_id]
1250 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 1251 self._print_sig_code(func, s)
c8bf86d5
PH
1252 return func(s)
1253 except Exception as e:
1254 tb = traceback.format_exc()
1255 raise ExtractorError(
78caa52a 1256 'Signature extraction failed: ' + tb, cause=e)
e0df6211 1257
f96f5dda 1258 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
de7f3446 1259 try:
60e47a26 1260 subs_doc = self._download_xml(
38c2e5b8 1261 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1262 video_id, note=False)
1263 except ExtractorError as err:
9b9c5355 1264 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
de7f3446 1265 return {}
de7f3446
JMF
1266
1267 sub_lang_list = {}
60e47a26
JMF
1268 for track in subs_doc.findall('track'):
1269 lang = track.attrib['lang_code']
7e660ac1
LD
1270 if lang in sub_lang_list:
1271 continue
360e1ca5 1272 sub_formats = []
23d17e4b 1273 for ext in self._SUBTITLE_FORMATS:
15707c7e 1274 params = compat_urllib_parse_urlencode({
360e1ca5
JMF
1275 'lang': lang,
1276 'v': video_id,
1277 'fmt': ext,
1278 'name': track.attrib['name'].encode('utf-8'),
1279 })
1280 sub_formats.append({
1281 'url': 'https://www.youtube.com/api/timedtext?' + params,
1282 'ext': ext,
1283 })
1284 sub_lang_list[lang] = sub_formats
9f448fcb 1285 if has_live_chat_replay:
321bf820 1286 sub_lang_list['live_chat'] = [
1287 {
1288 'video_id': video_id,
1289 'ext': 'json',
1290 'protocol': 'youtube_live_chat_replay',
1291 },
9f448fcb 1292 ]
de7f3446 1293 if not sub_lang_list:
69ea8ca4 1294 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
1295 return {}
1296 return sub_lang_list
1297
a72778d3
S
1298 def _get_ytplayer_config(self, video_id, webpage):
1299 patterns = (
526b3b07
S
1300 # User data may contain arbitrary character sequences that may affect
1301 # JSON extraction with regex, e.g. when '};' is contained the second
1302 # regex won't capture the whole JSON. Yet working around by trying more
1303 # concrete regex first keeping in mind proper quoted string handling
1304 # to be implemented in future that will replace this workaround (see
067aa17e
S
1305 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1306 # https://github.com/ytdl-org/youtube-dl/pull/7599)
a72778d3
S
1307 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1308 r';ytplayer\.config\s*=\s*({.+?});',
8bdd16b4 1309 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
a72778d3
S
1310 )
1311 config = self._search_regex(
1312 patterns, webpage, 'ytplayer.config', default=None)
1313 if config:
1314 return self._parse_json(
1315 uppercase_escape(config), video_id, fatal=False)
0e49d9a6 1316
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from the initial-data JSON.

    Walks the metadata rows of every videoSecondaryInfoRenderer under
    contents.twoColumnWatchNextResults.results.results.contents and maps
    the row titles 'Album', 'Artist' and 'Song' to the keys 'album',
    'artist' and 'track'. A repeated key starts a new track.

    Returns a list of dicts (possibly empty), one per track.
    """
    music_metadata = []
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata
    for content in contents:
        music_track = {}
        if not isinstance(content, dict):
            continue
        videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(videoSecondaryInfoRenderer, dict):
            continue
        rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue
        for row in rows:
            metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(metadataRowRenderer, dict):
                continue
            key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
            value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
                try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
            # compat_str rather than str: on Python 2 the decoded JSON text
            # is unicode, which an exact "type(...) is str" test rejects
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            if key in key_map:
                if key_map[key] in music_track:
                    # we've started on a new track
                    music_metadata.append(music_track)
                    music_track = {}
                music_track[key_map[key]] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1354
360e1ca5 1355 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
1356 """We need the webpage for getting the captions url, pass it as an
1357 argument to speed up the process."""
69ea8ca4 1358 self.to_screen('%s: Looking for automatic captions' % video_id)
a72778d3 1359 player_config = self._get_ytplayer_config(video_id, webpage)
78caa52a 1360 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
a72778d3 1361 if not player_config:
de7f3446
JMF
1362 self._downloader.report_warning(err_msg)
1363 return {}
de7f3446 1364 try:
8bdd16b4 1365 args = player_config['args']
1366 caption_url = args.get('ttsurl')
1367 if caption_url:
b78b292f
S
1368 timestamp = args['timestamp']
1369 # We get the available subtitles
15707c7e 1370 list_params = compat_urllib_parse_urlencode({
b78b292f
S
1371 'type': 'list',
1372 'tlangs': 1,
1373 'asrs': 1,
1374 })
1375 list_url = caption_url + '&' + list_params
1376 caption_list = self._download_xml(list_url, video_id)
1377 original_lang_node = caption_list.find('track')
1378 if original_lang_node is None:
1379 self._downloader.report_warning('Video doesn\'t have automatic captions')
1380 return {}
1381 original_lang = original_lang_node.attrib['lang_code']
1382 caption_kind = original_lang_node.attrib.get('kind', '')
1383
1384 sub_lang_list = {}
1385 for lang_node in caption_list.findall('target'):
1386 sub_lang = lang_node.attrib['lang_code']
1387 sub_formats = []
1388 for ext in self._SUBTITLE_FORMATS:
15707c7e 1389 params = compat_urllib_parse_urlencode({
b78b292f
S
1390 'lang': original_lang,
1391 'tlang': sub_lang,
1392 'fmt': ext,
1393 'ts': timestamp,
1394 'kind': caption_kind,
1395 })
1396 sub_formats.append({
1397 'url': caption_url + '&' + params,
1398 'ext': ext,
1399 })
1400 sub_lang_list[sub_lang] = sub_formats
1401 return sub_lang_list
1402
ddbb4c5c
S
1403 def make_captions(sub_url, sub_langs):
1404 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1405 caption_qs = compat_parse_qs(parsed_sub_url.query)
1406 captions = {}
1407 for sub_lang in sub_langs:
1408 sub_formats = []
1409 for ext in self._SUBTITLE_FORMATS:
1410 caption_qs.update({
1411 'tlang': [sub_lang],
1412 'fmt': [ext],
1413 })
1414 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1415 query=compat_urllib_parse_urlencode(caption_qs, True)))
1416 sub_formats.append({
1417 'url': sub_url,
1418 'ext': ext,
1419 })
1420 captions[sub_lang] = sub_formats
1421 return captions
1422
1423 # New captions format as of 22.06.2017
8bdd16b4 1424 player_response = args.get('player_response')
1425 if player_response and isinstance(player_response, compat_str):
1426 player_response = self._parse_json(
1427 player_response, video_id, fatal=False)
1428 if player_response:
1429 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
1430 base_url = renderer['captionTracks'][0]['baseUrl']
59c5fa91
PO
1431 sub_lang_list = []
1432 for lang in renderer['translationLanguages']:
1433 lang_code = lang.get('languageCode')
1434 if lang_code:
1435 sub_lang_list.append(lang_code)
1436 return make_captions(base_url, sub_lang_list)
1437
8bdd16b4 1438 # Some videos don't provide ttsurl but rather caption_tracks and
1439 # caption_translation_languages (e.g. 20LmZk1hakA)
1440 # Does not used anymore as of 22.06.2017
1441 caption_tracks = args['caption_tracks']
1442 caption_translation_languages = args['caption_translation_languages']
1443 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
1444 sub_lang_list = []
1445 for lang in caption_translation_languages.split(','):
1446 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1447 sub_lang = lang_qs.get('lc', [None])[0]
1448 if sub_lang:
1449 sub_lang_list.append(sub_lang)
1450 return make_captions(caption_url, sub_lang_list)
de7f3446
JMF
1451 # An extractor error can be raise by the download process if there are
1452 # no automatic captions but there are subtitles
ddbb4c5c 1453 except (KeyError, IndexError, ExtractorError):
de7f3446
JMF
1454 self._downloader.report_warning(err_msg)
1455 return {}
1456
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-stats URL so the video is marked as watched.

    The URL is taken from player_response, falling back to video_info; when
    neither yields a valid URL nothing is done. A failed request is not
    fatal.
    """
    stats_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not stats_url:
        stats_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(stats_url)
    if not playback_url:
        return

    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = []
    for _ in range(0, 16):
        cpn_chars.append(CPN_ALPHABET[random.randint(0, 256) & 63])
    cpn = ''.join(cpn_chars)

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    new_query = compat_urllib_parse_urlencode(qs, True)
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1482
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return the YouTube embeds found in webpage, in order of discovery.

    Three kinds of embed are recognised: player URLs in iframe / embed /
    object / SWFObject markup, lazyYT ids, and ids from the Wordpress
    "YouTube Video Importer" plugin. The first two are HTML-unescaped.
    """
    # Embedded YouTube player
    # The embedSWF alternative used to be "embedSWF\(?:\s*", which demands a
    # literal ':' and so never matched a call such as embedSWF("//...").
    # The ':' is now optional; everything matched before still matches.
    embed_re = r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:?\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1'''
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(embed_re, webpage)]

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(lazy_id)
        for lazy_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Each match is a tuple of groups; the id is the last one
    entries.extend(m[-1] for m in matches)

    return entries
1514
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by YoutubeIE._extract_urls, or None."""
    for first_hit in YoutubeIE._extract_urls(webpage):
        return first_hit
    return None
1519
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured from url by cls._VALID_URL.

    The pattern is matched in verbose mode and the id is taken from its
    second group.

    Raises ExtractorError when url does not match the pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1527
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build the chapter list from the initial data embedded in webpage.

    Each chapter starts at its own timeRangeStartMillis (converted to
    seconds) and ends where the next chapter starts; the last chapter ends
    at duration.  Chapters whose start or end cannot be determined are
    left out.

    Returns a list of dicts with 'start_time', 'end_time' and 'title', or
    None when the page is empty, no initial data dict is available or it
    carries no chapter list.
    """
    if not webpage:
        return
    initial_data = self._extract_yt_initial_data(video_id, webpage)
    if not (initial_data and isinstance(initial_data, dict)):
        return
    raw_chapters = try_get(
        initial_data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not raw_chapters:
        return

    def start_seconds(entry):
        # Start offset of one chapter entry in seconds, None if unavailable.
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(raw_chapters)
    result = []
    for index, entry in enumerate(raw_chapters):
        begin = start_seconds(entry)
        if begin is None:
            continue
        following = index + 1
        if following < total:
            finish = start_seconds(raw_chapters[following])
        else:
            # The last chapter runs until the end of the video.
            finish = duration
        if finish is None:
            continue
        chapter_title = try_get(
            entry, lambda x: x['chapterRenderer']['title']['simpleText'],
            compat_str)
        result.append({
            'start_time': begin,
            'end_time': finish,
            'title': chapter_title,
        })
    return result
1572
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build the chapter list from seekTo timestamp links in description.

    description is the HTML description; every <br />-delimited line that
    contains a timestamp link calling yt.www.watch.player.seekTo is treated
    as a chapter starting at that timestamp.  A chapter ends where the next
    one starts; the last chapter ends at duration.

    duration may be None.  In that case timestamps are not clamped or
    checked against the video length and the last chapter, whose end is
    unknown, is left out.

    Returns a list of dicts with 'start_time', 'end_time' and 'title', or
    None when description is empty or contains no chapter lines.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    has_duration = duration is not None
    line_count = len(chapter_lines)
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so the
        # duration checks only run when the duration is actually known.
        if has_duration and start_time > duration:
            break
        end_time = (duration if next_num == line_count
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if has_duration and end_time > duration:
            end_time = duration
        # Timestamps going backwards mean these are not real chapters.
        if start_time > end_time:
            break
        # Drop the timestamp link itself and tidy up what is left.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1607
84213ea8
S
1608 def _extract_chapters(self, webpage, description, video_id, duration):
1609 return (self._extract_chapters_from_json(webpage, video_id, duration)
1610 or self._extract_chapters_from_description(description, duration))
1611
c5e8d7af 1612 def _real_extract(self, url):
cf7e015f
S
1613 url, smuggled_data = unsmuggle_url(url, {})
1614
7e8c0af0 1615 proto = (
78caa52a
PH
1616 'http' if self._downloader.params.get('prefer_insecure', False)
1617 else 'https')
7e8c0af0 1618
7c80519c 1619 start_time = None
297a564b 1620 end_time = None
7c80519c
JMF
1621 parsed_url = compat_urllib_parse_urlparse(url)
1622 for component in [parsed_url.fragment, parsed_url.query]:
1623 query = compat_parse_qs(component)
297a564b 1624 if start_time is None and 't' in query:
7c80519c 1625 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1626 if start_time is None and 'start' in query:
1627 start_time = parse_duration(query['start'][0])
297a564b
JMF
1628 if end_time is None and 'end' in query:
1629 end_time = parse_duration(query['end'][0])
7c80519c 1630
c5e8d7af
PH
1631 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1632 mobj = re.search(self._NEXT_URL_RE, url)
1633 if mobj:
7fd002c0 1634 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1635 video_id = self.extract_id(url)
c5e8d7af
PH
1636
1637 # Get video webpage
aa79ac0c 1638 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1639 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1640
1641 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1642 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1643
1644 # Attempt to extract SWF player URL
e0df6211 1645 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1646 if mobj is not None:
1647 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1648 else:
1649 player_url = None
1650
d8d24a92
S
1651 dash_mpds = []
1652
1653 def add_dash_mpd(video_info):
1654 dash_mpd = video_info.get('dashmpd')
1655 if dash_mpd and dash_mpd[0] not in dash_mpds:
1656 dash_mpds.append(dash_mpd[0])
1657
561b456e
S
1658 def add_dash_mpd_pr(pl_response):
1659 dash_mpd = url_or_none(try_get(
1660 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1661 compat_str))
1662 if dash_mpd and dash_mpd not in dash_mpds:
1663 dash_mpds.append(dash_mpd)
1664
c7121fa7
S
1665 is_live = None
1666 view_count = None
1667
1668 def extract_view_count(v_info):
1669 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1670
c2d125d9
S
1671 def extract_player_response(player_response, video_id):
1672 pl_response = str_or_none(player_response)
1673 if not pl_response:
1674 return
1675 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1676 if isinstance(pl_response, dict):
1677 add_dash_mpd_pr(pl_response)
1678 return pl_response
1679
fb2c9277
U
1680 def extract_embedded_config(embed_webpage, video_id):
1681 embedded_config = self._search_regex(
1682 r'setConfig\(({.*})\);',
1683 embed_webpage, 'ytInitialData', default=None)
1684 if embedded_config:
1685 return embedded_config
1686
dbdaaa23
S
1687 player_response = {}
1688
c5e8d7af 1689 # Get video info
43ebf77d 1690 video_info = {}
6449cd80 1691 embed_webpage = None
39e7107d
U
1692 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1693 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1694 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1695 age_gate = True
1696 # We simulate the access to the video from www.youtube.com/v/{video_id}
1697 # this can be viewed without login into Youtube
beb95e77
CL
1698 url = proto + '://www.youtube.com/embed/%s' % video_id
1699 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1700 ext = extract_embedded_config(embed_webpage, video_id)
1701 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1702 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1703 if not playable_in_embed:
1704 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1705 playable_in_embed = ''
1706 else:
1707 playable_in_embed = playable_in_embed.group('playableinEmbed')
1708 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1709 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1710 if playable_in_embed == 'false':
c73baf23
U
1711 '''
1712 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1713 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1714 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1715 '''
1716 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1717 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1718 age_gate = False
1719 # Try looking directly into the video webpage
1720 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1721 if ytplayer_config:
59c5fa91
PO
1722 args = ytplayer_config.get("args")
1723 if args is not None:
1724 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1725 # Convert to the same format returned by compat_parse_qs
1726 video_info = dict((k, [v]) for k, v in args.items())
1727 add_dash_mpd(video_info)
1728 # Rental video is not rented but preview is available (e.g.
1729 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1730 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1731 if not video_info and args.get('ypc_vid'):
1732 return self.url_result(
1733 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1734 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1735 is_live = True
1736 if not player_response:
1737 player_response = extract_player_response(args.get('player_response'), video_id)
1738 elif not player_response:
1739 player_response = ytplayer_config
4bb9c880
U
1740 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1741 add_dash_mpd_pr(player_response)
9d9314cb
U
1742 else:
1743 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1744 else:
1745 data = compat_urllib_parse_urlencode({
1746 'video_id': video_id,
1747 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1748 'sts': self._search_regex(
1749 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1750 })
1751 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1752 try:
1753 video_info_webpage = self._download_webpage(
1754 video_info_url, video_id,
1755 note='Refetching age-gated info webpage',
1756 errnote='unable to download video info webpage')
1757 except ExtractorError:
1758 video_info_webpage = None
1759 if video_info_webpage:
1760 video_info = compat_parse_qs(video_info_webpage)
1761 pl_response = video_info.get('player_response', [None])[0]
1762 player_response = extract_player_response(pl_response, video_id)
1763 add_dash_mpd(video_info)
1764 view_count = extract_view_count(video_info)
c108eb73
JMF
1765 else:
1766 age_gate = False
d8d24a92 1767 # Try looking directly into the video webpage
a72778d3 1768 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
8bdd16b4 1769 if ytplayer_config:
1770 args = ytplayer_config.get('args', {})
4c76aa06 1771 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1772 # Convert to the same format returned by compat_parse_qs
1773 video_info = dict((k, [v]) for k, v in args.items())
1774 add_dash_mpd(video_info)
6496ccb4
S
1775 # Rental video is not rented but preview is available (e.g.
1776 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1777 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1778 if not video_info and args.get('ypc_vid'):
1779 return self.url_result(
1780 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1781 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1782 is_live = True
dbdaaa23 1783 if not player_response:
c2d125d9 1784 player_response = extract_player_response(args.get('player_response'), video_id)
0a3cf9ad 1785 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1786 add_dash_mpd_pr(player_response)
bbb7c3f7 1787
8bdd16b4 1788 if not video_info and not player_response:
1789 player_response = extract_player_response(
1790 self._search_regex(
1791 r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
1792 'initial player response', default='{}'),
1793 video_id)
1794
bbb7c3f7 1795 def extract_unavailable_message():
0add33ab
S
1796 messages = []
1797 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1798 msg = self._html_search_regex(
1799 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1800 video_webpage, 'unavailable %s' % kind, default=None)
1801 if msg:
1802 messages.append(msg)
1803 if messages:
1804 return '\n'.join(messages)
bbb7c3f7 1805
f93abcf1 1806 if not video_info and not player_response:
15be3eb5
RA
1807 unavailable_message = extract_unavailable_message()
1808 if not unavailable_message:
1809 unavailable_message = 'Unable to extract video data'
1810 raise ExtractorError(
1811 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1812
f93abcf1
S
1813 if not isinstance(video_info, dict):
1814 video_info = {}
1815
dbdaaa23
S
1816 video_details = try_get(
1817 player_response, lambda x: x['videoDetails'], dict) or {}
1818
37357d21
S
1819 microformat = try_get(
1820 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1821
8dbf751a
RA
1822 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1823 if not video_title:
cf7e015f
S
1824 self._downloader.report_warning('Unable to extract video title')
1825 video_title = '_'
1826
9cafc3fd 1827 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1828 if video_description:
fa4bc6e7
RA
1829
1830 def replace_url(m):
1831 redir_url = compat_urlparse.urljoin(url, m.group(1))
1832 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1833 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1834 qs = compat_parse_qs(parsed_redir_url.query)
1835 q = qs.get('q')
1836 if q and q[0]:
1837 return q[0]
1838 return redir_url
1839
9cafc3fd 1840 description_original = video_description = re.sub(r'''(?x)
cf7e015f 1841 <a\s+
25cb7a0e 1842 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 1843 (?:title|href)="([^"]+)"\s+
25cb7a0e 1844 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 1845 class="[^"]*"[^>]*>
23f13e97 1846 [^<]+\.{3}\s*
cf7e015f 1847 </a>
fa4bc6e7 1848 ''', replace_url, video_description)
cf7e015f
S
1849 video_description = clean_html(video_description)
1850 else:
ea74e00b
DP
1851 video_description = video_details.get('shortDescription')
1852 if video_description is None:
1853 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 1854
8fe10494 1855 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 1856 if not self._downloader.params.get('noplaylist'):
8fe10494
S
1857 multifeed_metadata_list = try_get(
1858 player_response,
1859 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1860 compat_str) or try_get(
1861 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1862 if multifeed_metadata_list:
1863 entries = []
1864 feed_ids = []
1865 for feed in multifeed_metadata_list.split(','):
1866 # Unquote should take place before split on comma (,) since textual
1867 # fields may contain comma as well (see
067aa17e 1868 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 1869 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
1870
1871 def feed_entry(name):
1872 return try_get(feed_data, lambda x: x[name][0], compat_str)
1873
1874 feed_id = feed_entry('id')
1875 if not feed_id:
1876 continue
1877 feed_title = feed_entry('title')
1878 title = video_title
1879 if feed_title:
1880 title += ' (%s)' % feed_title
8fe10494
S
1881 entries.append({
1882 '_type': 'url_transparent',
1883 'ie_key': 'Youtube',
1884 'url': smuggle_url(
1885 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1886 {'force_singlefeed': True}),
6b09401b 1887 'title': title,
8fe10494 1888 })
6b09401b 1889 feed_ids.append(feed_id)
8fe10494
S
1890 self.to_screen(
1891 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1892 % (', '.join(feed_ids), video_id))
1893 return self.playlist_result(entries, video_id, video_title, video_description)
1894 else:
1895 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 1896
c7121fa7 1897 if view_count is None:
1c9c8de2 1898 view_count = extract_view_count(video_info)
dbdaaa23
S
1899 if view_count is None and video_details:
1900 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
1901 if view_count is None and microformat:
1902 view_count = int_or_none(microformat.get('viewCount'))
1d699755 1903
27019dbb 1904 if is_live is None:
898238e9 1905 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 1906
321bf820 1907 has_live_chat_replay = False
f0f76a33 1908 if not is_live:
321bf820 1909 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
1910 try:
1911 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
1912 has_live_chat_replay = True
f0f76a33 1913 except (KeyError, IndexError, TypeError):
321bf820 1914 pass
1915
c5e8d7af
PH
1916 # Check for "rental" videos
1917 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 1918 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 1919
c63ca0ee
S
1920 def _extract_filesize(media_url):
1921 return int_or_none(self._search_regex(
1922 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1923
bf1317d2
S
1924 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1925 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1926
c5e8d7af
PH
1927 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1928 self.report_rtmp_download()
dd27fd17
PH
1929 formats = [{
1930 'format_id': '_rtmp',
1931 'protocol': 'rtmp',
1932 'url': video_info['conn'][0],
1933 'player_url': player_url,
1934 }]
bf1317d2 1935 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 1936 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 1937 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 1938 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 1939 formats = []
3318832e 1940 formats_spec = {}
82156fdb 1941 fmt_list = video_info.get('fmt_list', [''])[0]
1942 if fmt_list:
1943 for fmt in fmt_list.split(','):
1944 spec = fmt.split('/')
3318832e 1945 if len(spec) > 1:
1946 width_height = spec[1].split('x')
1947 if len(width_height) == 2:
1948 formats_spec[spec[0]] = {
1949 'resolution': spec[1],
1950 'width': int_or_none(width_height[0]),
1951 'height': int_or_none(width_height[1]),
1952 }
bf1317d2
S
1953 for fmt in streaming_formats:
1954 itag = str_or_none(fmt.get('itag'))
1955 if not itag:
201e9eaa 1956 continue
bf1317d2
S
1957 quality = fmt.get('quality')
1958 quality_label = fmt.get('qualityLabel') or quality
1959 formats_spec[itag] = {
1960 'asr': int_or_none(fmt.get('audioSampleRate')),
1961 'filesize': int_or_none(fmt.get('contentLength')),
1962 'format_note': quality_label,
1963 'fps': int_or_none(fmt.get('fps')),
1964 'height': int_or_none(fmt.get('height')),
bf1317d2
S
1965 # bitrate for itag 43 is always 2147483647
1966 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1967 'width': int_or_none(fmt.get('width')),
1968 }
1969
1970 for fmt in streaming_formats:
00eb865b 1971 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
1972 continue
1973 url = url_or_none(fmt.get('url'))
1974
1975 if not url:
fa3db383 1976 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
1977 if not cipher:
1978 continue
1979 url_data = compat_parse_qs(cipher)
1980 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1981 if not url:
1982 continue
1983 else:
1984 cipher = None
1985 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1986
2f483bc1
S
1987 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1988 # Unsupported FORMAT_STREAM_TYPE_OTF
1989 if stream_type == 3:
1990 continue
6449cd80 1991
bf1317d2
S
1992 format_id = fmt.get('itag') or url_data['itag'][0]
1993 if not format_id:
1994 continue
1995 format_id = compat_str(format_id)
a49eccdf 1996
bf1317d2
S
1997 if cipher:
1998 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
8bdd16b4 1999 ASSETS_RE = (
2000 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
2001 r'"jsUrl"\s*:\s*("[^"]+")',
2002 r'"assets":.+?"js":\s*("[^"]+")')
bf1317d2
S
2003 jsplayer_url_json = self._search_regex(
2004 ASSETS_RE,
2005 embed_webpage if age_gate else video_webpage,
2006 'JS player URL (1)', default=None)
2007 if not jsplayer_url_json and not age_gate:
2008 # We need the embed website after all
2009 if embed_webpage is None:
2010 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2011 embed_webpage = self._download_webpage(
2012 embed_url, video_id, 'Downloading embed webpage')
2013 jsplayer_url_json = self._search_regex(
2014 ASSETS_RE, embed_webpage, 'JS player URL')
2015
2016 player_url = json.loads(jsplayer_url_json)
cf010131 2017 if player_url is None:
bf1317d2
S
2018 player_url_json = self._search_regex(
2019 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2020 video_webpage, 'age gate player URL')
2021 player_url = json.loads(player_url_json)
2022
2023 if 'sig' in url_data:
2024 url += '&signature=' + url_data['sig'][0]
2025 elif 's' in url_data:
2026 encrypted_sig = url_data['s'][0]
2027
2028 if self._downloader.params.get('verbose'):
2029 if player_url is None:
bf1317d2 2030 player_desc = 'unknown'
cf010131 2031 else:
e40c758c
S
2032 player_type, player_version = self._extract_player_info(player_url)
2033 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2034 parts_sizes = self._signature_cache_id(encrypted_sig)
2035 self.to_screen('{%s} signature length %s, %s' %
2036 (format_id, parts_sizes, player_desc))
2037
2038 signature = self._decrypt_signature(
2039 encrypted_sig, video_id, player_url, age_gate)
2040 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2041 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2042 if 'ratebypass' not in url:
2043 url += '&ratebypass=yes'
c9afb51c 2044
94278f72
YCH
2045 dct = {
2046 'format_id': format_id,
2047 'url': url,
2048 'player_url': player_url,
2049 }
2050 if format_id in self._formats:
2051 dct.update(self._formats[format_id])
3318832e 2052 if format_id in formats_spec:
2053 dct.update(formats_spec[format_id])
94278f72 2054
aabc2be6 2055 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2056 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2057 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2058 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2059 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2060
bf1317d2
S
2061 if width is None:
2062 width = int_or_none(fmt.get('width'))
2063 if height is None:
2064 height = int_or_none(fmt.get('height'))
2065
c63ca0ee
S
2066 filesize = int_or_none(url_data.get(
2067 'clen', [None])[0]) or _extract_filesize(url)
2068
bf1317d2
S
2069 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2070 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2071
4878759f
S
2072 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2073 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2074 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2075
94278f72 2076 more_fields = {
c63ca0ee 2077 'filesize': filesize,
bf1317d2 2078 'tbr': tbr,
c9afb51c
AH
2079 'width': width,
2080 'height': height,
bf1317d2
S
2081 'fps': fps,
2082 'format_note': quality_label or quality,
c9afb51c 2083 }
94278f72
YCH
2084 for key, value in more_fields.items():
2085 if value:
2086 dct[key] = value
bf1317d2 2087 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2088 if type_:
2089 type_split = type_.split(';')
2090 kind_ext = type_split[0].split('/')
2091 if len(kind_ext) == 2:
94278f72
YCH
2092 kind, _ = kind_ext
2093 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2094 if kind in ('audio', 'video'):
2095 codecs = None
2096 for mobj in re.finditer(
2097 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2098 if mobj.group('key') == 'codecs':
2099 codecs = mobj.group('val')
2100 break
2101 if codecs:
6310acf5 2102 dct.update(parse_codecs(codecs))
e4a60912
S
2103 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2104 dct['downloader_options'] = {
2105 # Youtube throttles chunks >~10M
2106 'http_chunk_size': 10485760,
2107 }
aabc2be6 2108 formats.append(dct)
c5e8d7af 2109 else:
c3e54389
S
2110 manifest_url = (
2111 url_or_none(try_get(
2112 player_response,
2113 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2114 compat_str))
2115 or url_or_none(try_get(
c3e54389
S
2116 video_info, lambda x: x['hlsvp'][0], compat_str)))
2117 if manifest_url:
2118 formats = []
2119 m3u8_formats = self._extract_m3u8_formats(
2120 manifest_url, video_id, 'mp4', fatal=False)
2121 for a_format in m3u8_formats:
2122 itag = self._search_regex(
2123 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2124 if itag:
2125 a_format['format_id'] = itag
2126 if itag in self._formats:
2127 dct = self._formats[itag].copy()
2128 dct.update(a_format)
2129 a_format = dct
2130 a_format['player_url'] = player_url
2131 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2132 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2133 if self._downloader.params.get('youtube_include_hls_manifest', True):
2134 formats.append(a_format)
c3e54389 2135 else:
13577349 2136 error_message = extract_unavailable_message()
c3e54389 2137 if not error_message:
13577349
S
2138 error_message = clean_html(try_get(
2139 player_response, lambda x: x['playabilityStatus']['reason'],
2140 compat_str))
2141 if not error_message:
2142 error_message = clean_html(
2143 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2144 if error_message:
2145 raise ExtractorError(error_message, expected=True)
2146 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2147
7e72694b 2148 # uploader
dbdaaa23
S
2149 video_uploader = try_get(
2150 video_info, lambda x: x['author'][0],
2151 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2152 if video_uploader:
2153 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2154 else:
2155 self._downloader.report_warning('unable to extract uploader name')
2156
2157 # uploader_id
2158 video_uploader_id = None
2159 video_uploader_url = None
2160 mobj = re.search(
2161 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2162 video_webpage)
2163 if mobj is not None:
2164 video_uploader_id = mobj.group('uploader_id')
2165 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2166 else:
2167 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2168 if owner_profile_url:
2169 video_uploader_id = self._search_regex(
2170 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2171 default=None)
2172 video_uploader_url = owner_profile_url
7e72694b 2173
b45a9e69 2174 channel_id = (
3089bc74
S
2175 str_or_none(video_details.get('channelId'))
2176 or self._html_search_meta(
2177 'channelId', video_webpage, 'channel id', default=None)
2178 or self._search_regex(
b45a9e69 2179 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2180 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2181 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2182
b477fc13
S
2183 thumbnails = []
2184 thumbnails_list = try_get(
2185 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2186 for t in thumbnails_list:
2187 if not isinstance(t, dict):
2188 continue
2189 thumbnail_url = url_or_none(t.get('url'))
2190 if not thumbnail_url:
2191 continue
2192 thumbnails.append({
2193 'url': thumbnail_url,
2194 'width': int_or_none(t.get('width')),
2195 'height': int_or_none(t.get('height')),
2196 })
2197
2198 if not thumbnails:
7e72694b 2199 video_thumbnail = None
b477fc13
S
2200 # We try first to get a high quality image:
2201 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2202 video_webpage, re.DOTALL)
2203 if m_thumb is not None:
2204 video_thumbnail = m_thumb.group(1)
2205 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2206 if thumbnail_url:
2207 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2208 if video_thumbnail:
2209 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2210
2211 # upload date
2212 upload_date = self._html_search_meta(
2213 'datePublished', video_webpage, 'upload date', default=None)
2214 if not upload_date:
2215 upload_date = self._search_regex(
2216 [r'(?s)id="eow-date.*?>(.*?)</span>',
2217 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2218 video_webpage, 'upload date', default=None)
37357d21
S
2219 if not upload_date:
2220 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2221 upload_date = unified_strdate(upload_date)
2222
2223 video_license = self._html_search_regex(
2224 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2225 video_webpage, 'license', default=None)
2226
2227 m_music = re.search(
2228 r'''(?x)
2229 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2230 <ul[^>]*>\s*
2231 <li>(?P<title>.+?)
2232 by (?P<creator>.+?)
2233 (?:
2234 \(.+?\)|
2235 <a[^>]*
2236 (?:
2237 \bhref=["\']/red[^>]*>| # drop possible
2238 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2239 )
2240 .*?
2241 )?</li
2242 ''',
2243 video_webpage)
2244 if m_music:
2245 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2246 video_creator = clean_html(m_music.group('creator'))
2247 else:
2248 video_alt_title = video_creator = None
2249
2250 def extract_meta(field):
2251 return self._html_search_regex(
2252 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2253 video_webpage, field, default=None)
2254
2255 track = extract_meta('Song')
2256 artist = extract_meta('Artist')
92bc97d3 2257 album = extract_meta('Album')
822b9d9c
RA
2258
2259 # Youtube Music Auto-generated description
92bc97d3 2260 release_date = release_year = None
822b9d9c
RA
2261 if video_description:
2262 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2263 if mobj:
2264 if not track:
2265 track = mobj.group('track').strip()
2266 if not artist:
2267 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2268 if not album:
2269 album = mobj.group('album'.strip())
822b9d9c
RA
2270 release_year = mobj.group('release_year')
2271 release_date = mobj.group('release_date')
2272 if release_date:
2273 release_date = release_date.replace('-', '')
2274 if not release_year:
2275 release_year = int(release_date[:4])
2276 if release_year:
2277 release_year = int(release_year)
7e72694b 2278
9322f116 2279 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2280 if yt_initial:
2281 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2282 if len(music_metadata):
2283 album = music_metadata[0].get('album')
2284 artist = music_metadata[0].get('artist')
2285 track = music_metadata[0].get('track')
2286
7e72694b
S
2287 m_episode = re.search(
2288 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2289 video_webpage)
2290 if m_episode:
c2dd2dc0 2291 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2292 season_number = int(m_episode.group('season'))
2293 episode_number = int(m_episode.group('episode'))
2294 else:
2295 series = season_number = episode_number = None
2296
2297 m_cat_container = self._search_regex(
2298 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2299 video_webpage, 'categories', default=None)
dbeafce5 2300 category = None
7e72694b
S
2301 if m_cat_container:
2302 category = self._html_search_regex(
2303 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2304 default=None)
dbeafce5
S
2305 if not category:
2306 category = try_get(
2307 microformat, lambda x: x['category'], compat_str)
2308 video_categories = None if category is None else [category]
7e72694b
S
2309
2310 video_tags = [
2311 unescapeHTML(m.group('content'))
2312 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2313 if not video_tags:
2314 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2315
2316 def _extract_count(count_name):
2317 return str_to_int(self._search_regex(
8bdd16b4 2318 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
7e72694b
S
2319 % re.escape(count_name),
2320 video_webpage, count_name, default=None))
2321
2322 like_count = _extract_count('like')
2323 dislike_count = _extract_count('dislike')
2324
dbdaaa23
S
2325 if view_count is None:
2326 view_count = str_to_int(self._search_regex(
2327 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2328 'view count', default=None))
2329
bf3c9326
S
2330 average_rating = (
2331 float_or_none(video_details.get('averageRating'))
2332 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2333
7e72694b 2334 # subtitles
321bf820 2335 video_subtitles = self.extract_subtitles(
2336 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2337 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2338
2339 video_duration = try_get(
2340 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2341 if not video_duration:
2342 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2343 if not video_duration:
2344 video_duration = parse_duration(self._html_search_meta(
2345 'duration', video_webpage, 'video duration'))
2346
b84071c0
JP
2347 # Get Subscriber Count of channel
2348 subscriber_count = parse_count(self._search_regex(
2349 r'"text":"([\d\.]+\w?) subscribers"',
2350 video_webpage,
2351 'subscriber count',
2352 default=None
2353 ))
2354
7e72694b
S
2355 # annotations
2356 video_annotations = None
2357 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2358 xsrf_token = self._search_regex(
2359 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2360 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2361 invideo_url = try_get(
2362 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2363 if xsrf_token and invideo_url:
2364 xsrf_field_name = self._search_regex(
2365 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2366 video_webpage, 'xsrf field name',
2367 group='xsrf_field_name', default='session_token')
2368 video_annotations = self._download_webpage(
2369 self._proto_relative_url(invideo_url),
2370 video_id, note='Downloading annotations',
2371 errnote='Unable to download video annotations', fatal=False,
2372 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2373
84213ea8 2374 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2375
dd27fd17 2376 # Look for the DASH manifest
203fb43f 2377 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2378 dash_mpd_fatal = True
8ff648e4 2379 for mpd_url in dash_mpds:
d8d24a92 2380 dash_formats = {}
774e208f 2381 try:
05d0d131
YCH
2382 def decrypt_sig(mobj):
2383 s = mobj.group(1)
2384 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2385 return '/signature/%s' % dec_s
2386
8ff648e4 2387 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2388
8ff648e4 2389 for df in self._extract_mpd_formats(
2390 mpd_url, video_id, fatal=dash_mpd_fatal,
2391 formats_dict=self._formats):
c63ca0ee
S
2392 if not df.get('filesize'):
2393 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2394 # Do not overwrite DASH format found in some previous DASH manifest
2395 if df['format_id'] not in dash_formats:
2396 dash_formats[df['format_id']] = df
77c6fb5b
S
2397 # Additional DASH manifests may end up in HTTP Error 403 therefore
2398 # allow them to fail without bug report message if we already have
2399 # some DASH manifest succeeded. This is temporary workaround to reduce
2400 # burst of bug reports until we figure out the reason and whether it
2401 # can be fixed at all.
2402 dash_mpd_fatal = False
774e208f
PH
2403 except (ExtractorError, KeyError) as e:
2404 self.report_warning(
2405 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2406 if dash_formats:
04b3b3df
JMF
2407 # Remove the formats we found through non-DASH, they
2408 # contain less info and it can be wrong, because we use
2409 # fixed values (for example the resolution). See
067aa17e 2410 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2411 # example.
d80265cc 2412 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2413 formats.extend(dash_formats.values())
d80044c2 2414
6271f1ca
PH
2415 # Check for malformed aspect ratio
2416 stretched_m = re.search(
2417 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2418 video_webpage)
2419 if stretched_m:
313dfc45
LL
2420 w = float(stretched_m.group('w'))
2421 h = float(stretched_m.group('h'))
5faf9fed
S
2422 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2423 # We will only process correct ratios.
313dfc45 2424 if w > 0 and h > 0:
41f24c32 2425 ratio = w / h
313dfc45
LL
2426 for f in formats:
2427 if f.get('vcodec') != 'none':
2428 f['stretched_ratio'] = ratio
6271f1ca 2429
026fbedc 2430 if not formats:
43ebf77d
S
2431 if 'reason' in video_info:
2432 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2433 regions_allowed = self._html_search_meta(
2434 'regionsAllowed', video_webpage, default=None)
2435 countries = regions_allowed.split(',') if regions_allowed else None
2436 self.raise_geo_restricted(
2437 msg=video_info['reason'][0], countries=countries)
2438 reason = video_info['reason'][0]
2439 if 'Invalid parameters' in reason:
2440 unavailable_message = extract_unavailable_message()
2441 if unavailable_message:
2442 reason = unavailable_message
2443 raise ExtractorError(
2444 'YouTube said: %s' % reason,
2445 expected=True, video_id=video_id)
2446 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2447 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2448
4bcc7bd1 2449 self._sort_formats(formats)
4ea3be0a 2450
21c340b8 2451 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2452
4ea3be0a 2453 return {
8bcc8756
JW
2454 'id': video_id,
2455 'uploader': video_uploader,
2456 'uploader_id': video_uploader_id,
fd050249 2457 'uploader_url': video_uploader_url,
dd4c4492
S
2458 'channel_id': channel_id,
2459 'channel_url': channel_url,
8bcc8756 2460 'upload_date': upload_date,
7caf9830 2461 'license': video_license,
936784b2 2462 'creator': video_creator or artist,
8bcc8756 2463 'title': video_title,
936784b2 2464 'alt_title': video_alt_title or track,
b477fc13 2465 'thumbnails': thumbnails,
8bcc8756
JW
2466 'description': video_description,
2467 'categories': video_categories,
000b6b5a 2468 'tags': video_tags,
8bcc8756 2469 'subtitles': video_subtitles,
360e1ca5 2470 'automatic_captions': automatic_captions,
8bcc8756
JW
2471 'duration': video_duration,
2472 'age_limit': 18 if age_gate else 0,
2473 'annotations': video_annotations,
9cafc3fd 2474 'chapters': chapters,
7e8c0af0 2475 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2476 'view_count': view_count,
4ea3be0a 2477 'like_count': like_count,
2478 'dislike_count': dislike_count,
bf3c9326 2479 'average_rating': average_rating,
8bcc8756 2480 'formats': formats,
2fe1ff85 2481 'is_live': is_live,
7c80519c 2482 'start_time': start_time,
297a564b 2483 'end_time': end_time,
12afdc2a
S
2484 'series': series,
2485 'season_number': season_number,
2486 'episode_number': episode_number,
936784b2
S
2487 'track': track,
2488 'artist': artist,
5caabd3c 2489 'album': album,
2490 'release_date': release_date,
2491 'release_year': release_year,
b84071c0 2492 'subscriber_count': subscriber_count,
4ea3be0a 2493 }
c5e8d7af 2494
5f6a1245 2495
8bdd16b4 2496class YoutubeTabIE(YoutubeBaseInfoExtractor):
2497 IE_DESC = 'YouTube.com tab'
2498 _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
2499 IE_NAME = 'youtube:tab'
2500
81127aa5 2501 _TESTS = [{
8bdd16b4 2502 # playlists, multipage
2503 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2504 'playlist_mincount': 94,
2505 'info_dict': {
2506 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2507 'title': 'Игорь Клейнер - Playlists',
2508 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2509 },
2510 }, {
2511 # playlists, multipage, different order
2512 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2513 'playlist_mincount': 94,
2514 'info_dict': {
2515 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2516 'title': 'Игорь Клейнер - Playlists',
2517 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2518 },
2519 }, {
2520 # playlists, singlepage
2521 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2522 'playlist_mincount': 4,
2523 'info_dict': {
2524 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2525 'title': 'ThirstForScience - Playlists',
2526 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2527 }
2528 }, {
2529 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2530 'only_matching': True,
2531 }, {
2532 # basic, single video playlist
0e30a7b9 2533 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2534 'info_dict': {
0e30a7b9 2535 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2536 'uploader': 'Sergey M.',
2537 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2538 'title': 'youtube-dl public playlist',
81127aa5 2539 },
0e30a7b9 2540 'playlist_count': 1,
9291475f 2541 }, {
8bdd16b4 2542 # empty playlist
0e30a7b9 2543 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2544 'info_dict': {
0e30a7b9 2545 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2546 'uploader': 'Sergey M.',
2547 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2548 'title': 'youtube-dl empty playlist',
9291475f
PH
2549 },
2550 'playlist_count': 0,
2551 }, {
8bdd16b4 2552 # Home tab
2553 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
9291475f 2554 'info_dict': {
8bdd16b4 2555 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2556 'title': 'lex will - Home',
2557 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2558 },
8bdd16b4 2559 'playlist_mincount': 2,
9291475f 2560 }, {
8bdd16b4 2561 # Videos tab
2562 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
9291475f 2563 'info_dict': {
8bdd16b4 2564 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2565 'title': 'lex will - Videos',
2566 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2567 },
8bdd16b4 2568 'playlist_mincount': 975,
9291475f 2569 }, {
8bdd16b4 2570 # Videos tab, sorted by popular
2571 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
9291475f 2572 'info_dict': {
8bdd16b4 2573 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2574 'title': 'lex will - Videos',
2575 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2576 },
8bdd16b4 2577 'playlist_mincount': 199,
9291475f 2578 }, {
8bdd16b4 2579 # Playlists tab
2580 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
9291475f 2581 'info_dict': {
8bdd16b4 2582 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2583 'title': 'lex will - Playlists',
2584 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
9291475f 2585 },
8bdd16b4 2586 'playlist_mincount': 17,
ac7553d0 2587 }, {
8bdd16b4 2588 # Community tab
2589 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
ac7553d0 2590 'info_dict': {
8bdd16b4 2591 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2592 'title': 'lex will - Community',
2593 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2594 },
2595 'playlist_mincount': 18,
87dadd45 2596 }, {
8bdd16b4 2597 # Channels tab
2598 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
87dadd45 2599 'info_dict': {
8bdd16b4 2600 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2601 'title': 'lex will - Channels',
2602 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2603 },
2604 'playlist_mincount': 138,
6b08cdf6 2605 }, {
8bdd16b4 2606 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
2607 'only_matching': True,
2608 }, {
2609 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
2610 'only_matching': True,
2611 }, {
2612 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ',
2613 'only_matching': True,
2614 }, {
2615 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2616 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2617 'info_dict': {
2618 'title': '29C3: Not my department',
2619 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2620 'uploader': 'Christiaan008',
2621 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2622 },
2623 'playlist_count': 96,
2624 }, {
2625 'note': 'Large playlist',
2626 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
6b08cdf6 2627 'info_dict': {
8bdd16b4 2628 'title': 'Uploads from Cauchemar',
2629 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2630 'uploader': 'Cauchemar',
2631 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
13a75688 2632 },
8bdd16b4 2633 'playlist_mincount': 1123,
2634 }, {
2635 # even larger playlist, 8832 videos
2636 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2637 'only_matching': True,
4b7df0d3
JMF
2638 }, {
2639 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2640 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2641 'info_dict': {
acf757f4
PH
2642 'title': 'Uploads from Interstellar Movie',
2643 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688 2644 'uploader': 'Interstellar Movie',
8bdd16b4 2645 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3 2646 },
481cc733 2647 'playlist_mincount': 21,
8bdd16b4 2648 }, {
2649 # https://github.com/ytdl-org/youtube-dl/issues/21844
2650 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2651 'info_dict': {
2652 'title': 'Data Analysis with Dr Mike Pound',
2653 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2654 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2655 'uploader': 'Computerphile',
2656 },
2657 'playlist_mincount': 11,
2658 }, {
2659 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2660 'only_matching': True,
dacb3a86
S
2661 }, {
2662 # Playlist URL that does not actually serve a playlist
2663 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2664 'info_dict': {
2665 'id': 'FqZTN594JQw',
2666 'ext': 'webm',
2667 'title': "Smiley's People 01 detective, Adventure Series, Action",
2668 'uploader': 'STREEM',
2669 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2670 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2671 'upload_date': '20150526',
2672 'license': 'Standard YouTube License',
2673 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2674 'categories': ['People & Blogs'],
2675 'tags': list,
dbdaaa23 2676 'view_count': int,
dacb3a86
S
2677 'like_count': int,
2678 'dislike_count': int,
2679 },
2680 'params': {
2681 'skip_download': True,
2682 },
13a75688 2683 'skip': 'This video is not available.',
dacb3a86 2684 'add_ie': [YoutubeIE.ie_key()],
481cc733 2685 }, {
8bdd16b4 2686 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
c0345b82 2687 'only_matching': True,
66b48727 2688 }, {
8bdd16b4 2689 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
66b48727 2690 'only_matching': True,
81127aa5 2691 }]
c5e8d7af 2692
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    A URL accepted by ``YoutubeLiveIE`` is always rejected here; any other
    URL is judged by the inherited ``suitable`` implementation.
    """
    if YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeTabIE, cls).suitable(url)
2697
def _extract_channel_id(self, webpage):
    """Return the channel id found in ``webpage``.

    The ``channelId`` meta tag is preferred.  When it is missing, a channel
    URL is looked up in a list of URL-bearing meta tags and the id is taken
    from the path component following ``/channel/``.

    :param webpage: HTML of the page to inspect.
    :return: the channel id string.
    """
    channel_id = self._html_search_meta(
        'channelId', webpage, 'channel id', default=None)
    if channel_id:
        return channel_id
    channel_url = self._html_search_meta(
        ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
         'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
         'twitter:app:url:googleplay'), webpage, 'channel url')
    # The quantifier has to live inside the capturing group: with
    # ``([^/?#&])+`` the group is overwritten on every repetition and ends
    # up holding only the last character of the id.
    return self._search_regex(
        r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
        channel_url, 'channel id')
15f6397c 2710
8bdd16b4 2711 @staticmethod
2712 def _extract_grid_item_renderer(item):
2713 for item_kind in ('Playlist', 'Video', 'Channel'):
2714 renderer = item.get('grid%sRenderer' % item_kind)
2715 if renderer:
2716 return renderer
2717
def _extract_video(self, renderer):
    """Build a ``url_transparent`` result for one video renderer.

    Title, description, duration, view count and uploader are read from
    ``renderer`` on a best-effort basis; a field that cannot be found is
    reported as ``None``.  The actual extraction is delegated to
    ``YoutubeIE`` through the ``ie_key``/``url`` pair.
    """
    def text_at(getter):
        # Only string values are accepted, anything else becomes None.
        return try_get(renderer, getter, compat_str)

    video_id = renderer.get('videoId')

    # Whitespace is removed first, then only a leading run of digits and
    # commas is kept as the view count.
    raw_views = text_at(lambda x: x['viewCountText']['simpleText']) or ''
    leading_digits = self._search_regex(
        r'^([\d,]+)', re.sub(r'\s', '', raw_views),
        'view count', default=None)

    info = {
        '_type': 'url_transparent',
        'ie_key': YoutubeIE.ie_key(),
        'id': video_id,
        'url': video_id,
    }
    info['title'] = text_at(
        (lambda x: x['title']['runs'][0]['text'],
         lambda x: x['title']['simpleText']))
    info['description'] = text_at(
        lambda x: x['descriptionSnippet']['runs'][0]['text'])
    info['duration'] = parse_duration(
        text_at(lambda x: x['lengthText']['simpleText']))
    info['view_count'] = str_to_int(leading_digits)
    info['uploader'] = text_at(lambda x: x['ownerText']['runs'][0]['text'])
    return info
652cdaa2 2747
8bdd16b4 2748 def _grid_entries(self, grid_renderer):
2749 for item in grid_renderer['items']:
2750 if not isinstance(item, dict):
39b62db1 2751 continue
8bdd16b4 2752 renderer = self._extract_grid_item_renderer(item)
2753 if not isinstance(renderer, dict):
2754 continue
2755 title = try_get(
2756 renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
2757 # playlist
2758 playlist_id = renderer.get('playlistId')
2759 if playlist_id:
2760 yield self.url_result(
2761 'https://www.youtube.com/playlist?list=%s' % playlist_id,
2762 ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
2763 video_title=title)
2764 # video
2765 video_id = renderer.get('videoId')
2766 if video_id:
2767 yield self._extract_video(renderer)
2768 # channel
2769 channel_id = renderer.get('channelId')
2770 if channel_id:
2771 title = try_get(
2772 renderer, lambda x: x['title']['simpleText'], compat_str)
2773 yield self.url_result(
2774 'https://www.youtube.com/channel/%s' % channel_id,
2775 ie=YoutubeTabIE.ie_key(), video_title=title)
2776
def _shelf_entries_trimmed(self, shelf_renderer):
    """Yield the grid entries embedded directly in a shelf.

    Nothing is yielded when the shelf has no
    ``content.horizontalListRenderer`` dict.
    """
    horizontal_list = try_get(
        shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
    if horizontal_list:
        # TODO: add support for nested playlists so each shelf is processed
        # as separate playlist
        # TODO: this includes only first N items
        for shelf_entry in self._grid_entries(horizontal_list):
            yield shelf_entry
2787
def _shelf_entries(self, shelf_renderer):
    """Yield a single URL result pointing at the shelf's own page.

    The target is the shelf endpoint's web command URL joined onto
    ``https://www.youtube.com``; nothing is yielded when no URL results.
    """
    endpoint_path = try_get(
        shelf_renderer,
        lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
        compat_str)
    shelf_url = urljoin('https://www.youtube.com', endpoint_path)
    if shelf_url:
        shelf_title = try_get(
            shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        yield self.url_result(shelf_url, video_title=shelf_title)
c5e8d7af 2798
8bdd16b4 2799 def _playlist_entries(self, video_list_renderer):
2800 for content in video_list_renderer['contents']:
2801 if not isinstance(content, dict):
2802 continue
2803 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2804 if not isinstance(renderer, dict):
2805 continue
2806 video_id = renderer.get('videoId')
2807 if not video_id:
2808 continue
2809 yield self._extract_video(renderer)
07aeced6 2810
8bdd16b4 2811 def _video_entry(self, video_renderer):
2812 video_id = video_renderer.get('videoId')
2813 if video_id:
2814 return self._extract_video(video_renderer)
dacb3a86 2815
def _post_thread_entries(self, post_thread_renderer):
    """Yield the videos referenced by one community post thread.

    Two sources are used: the video attached to the post (if any) and the
    links inside the post text that ``YoutubeIE`` declares suitable.  A link
    pointing at the attached video is not yielded a second time.

    Nothing is yielded when the thread has no
    ``post.backstagePostRenderer`` dict.
    """
    post_renderer = try_get(
        post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
    if not post_renderer:
        return
    # video attachment
    video_renderer = try_get(
        post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
    video_id = None
    if video_renderer:
        entry = self._video_entry(video_renderer)
        if entry:
            # Remember the attachment id so the duplicate check on the
            # inline links below can actually match it.
            video_id = video_renderer.get('videoId')
            yield entry
    # inline video links
    runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
    for run in runs:
        if not isinstance(run, dict):
            continue
        ep_url = try_get(
            run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
        if not ep_url:
            continue
        if not YoutubeIE.suitable(ep_url):
            continue
        ep_video_id = YoutubeIE._match_id(ep_url)
        if video_id == ep_video_id:
            continue
        # Report the id of the linked video, not the attachment's id.
        yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
dacb3a86 2844
8bdd16b4 2845 def _post_thread_continuation_entries(self, post_thread_continuation):
2846 contents = post_thread_continuation.get('contents')
2847 if not isinstance(contents, list):
2848 return
2849 for content in contents:
2850 renderer = content.get('backstagePostThreadRenderer')
2851 if not isinstance(renderer, dict):
2852 continue
2853 for entry in self._post_thread_entries(renderer):
2854 yield entry
07aeced6 2855
@staticmethod
def _extract_next_continuation_data(renderer):
    """Return request parameters built from ``nextContinuationData``.

    The data is read from ``renderer['continuations'][0]``.  ``None`` is
    returned when that dict is missing or carries no truthy
    ``continuation`` token.
    """
    next_data = try_get(
        renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
    token = next_data.get('continuation') if next_data else None
    if not token:
        return None
    return {
        'ctoken': token,
        'continuation': token,
        'itct': next_data.get('clickTrackingParams'),
    }
c5e8d7af 2871
@classmethod
def _extract_continuation(cls, renderer):
    """Return the parameters for requesting the page after ``renderer``.

    ``nextContinuationData`` is tried first.  Otherwise ``contents`` is
    scanned for the first ``continuationItemRenderer`` whose endpoint
    provides both a continuation token and click tracking params.
    ``None`` is returned when neither source yields a result.
    """
    params = cls._extract_next_continuation_data(renderer)
    if params:
        return params

    contents = renderer.get('contents')
    if not isinstance(contents, list):
        return None

    for content in contents:
        if not isinstance(content, dict):
            continue
        endpoint = try_get(
            content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
            dict)
        if not endpoint:
            continue
        token = try_get(
            endpoint, lambda x: x['continuationCommand']['token'], compat_str)
        tracking = endpoint.get('clickTrackingParams')
        # Both values are required; otherwise keep scanning.
        if token and tracking:
            return {
                'ctoken': token,
                'continuation': token,
                'itct': tracking,
            }
    return None
448830ce 2900
def _entries(self, tab, identity_token):
    """Yield every entry of ``tab``, then follow its continuation pages.

    The first phase walks ``tab['sectionListRenderer']['contents']`` and,
    within each ``itemSectionRenderer``, dispatches on the renderer kind
    (playlist video list, grid, shelf, post thread, single video).  While
    doing so it records the continuation parameters of the last paged
    renderer seen, falling back to those of the item section itself.

    The second phase requests ``browse_ajax`` pages for as long as a
    continuation is known and the response has a recognised shape.

    :param tab: the selected tab's ``content`` dict.
    :param identity_token: value for the ``x-youtube-identity-token``
        header; the header is omitted when this is falsy.
    """
    continuation = None
    slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
    for slr_content in slr_contents:
        if not isinstance(slr_content, dict):
            continue
        is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
        if not is_renderer:
            continue
        isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
        for isr_content in isr_contents:
            if not isinstance(isr_content, dict):
                continue
            renderer = isr_content.get('playlistVideoListRenderer')
            if renderer:
                for entry in self._playlist_entries(renderer):
                    yield entry
                continuation = self._extract_continuation(renderer)
                continue
            renderer = isr_content.get('gridRenderer')
            if renderer:
                for entry in self._grid_entries(renderer):
                    yield entry
                continuation = self._extract_continuation(renderer)
                continue
            renderer = isr_content.get('shelfRenderer')
            if renderer:
                for entry in self._shelf_entries(renderer):
                    yield entry
                continue
            renderer = isr_content.get('backstagePostThreadRenderer')
            if renderer:
                for entry in self._post_thread_entries(renderer):
                    yield entry
                continuation = self._extract_continuation(renderer)
                continue
            renderer = isr_content.get('videoRenderer')
            if renderer:
                entry = self._video_entry(renderer)
                if entry:
                    yield entry

        # No inner renderer offered a continuation: try the section itself.
        if not continuation:
            continuation = self._extract_continuation(is_renderer)

    headers = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '2.20201112.04.01',
    }
    if identity_token:
        headers['x-youtube-identity-token'] = identity_token

    for page_num in itertools.count(1):
        if not continuation:
            break
        browse = self._download_json(
            'https://www.youtube.com/browse_ajax', None,
            'Downloading page %d' % page_num,
            headers=headers, query=continuation, fatal=False)
        if not browse:
            break
        response = try_get(browse, lambda x: x[1]['response'], dict)
        if not response:
            break

        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict)
        if continuation_contents:
            continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
            if continuation_renderer:
                for entry in self._playlist_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('gridContinuation')
            if continuation_renderer:
                for entry in self._grid_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue
            continuation_renderer = continuation_contents.get('itemSectionContinuation')
            if continuation_renderer:
                for entry in self._post_thread_continuation_entries(continuation_renderer):
                    yield entry
                continuation = self._extract_continuation(continuation_renderer)
                continue

        continuation_items = try_get(
            response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
        if continuation_items:
            continuation_item = continuation_items[0]
            # A non-dict first item must end paging: looping again here
            # would repeat the request with the very same continuation
            # and never terminate.
            renderer = (
                continuation_item.get('playlistVideoRenderer')
                if isinstance(continuation_item, dict) else None)
            if renderer:
                video_list_renderer = {'contents': continuation_items}
                for entry in self._playlist_entries(video_list_renderer):
                    yield entry
                continuation = self._extract_continuation(video_list_renderer)
                continue

        # Unrecognised response shape: stop paging.
        break
9558dcec 3003
@staticmethod
def _extract_selected_tab(tabs):
    """Return the ``tabRenderer`` of the first tab flagged as selected.

    :raises ExtractorError: when no tab has a true ``selected`` flag.
    """
    for candidate in tabs:
        is_selected = try_get(
            candidate, lambda x: x['tabRenderer']['selected'], bool)
        if is_selected:
            return candidate['tabRenderer']
    raise ExtractorError('Unable to find selected tab')
b82f815f 3011
@staticmethod
def _extract_uploader(data):
    """Collect uploader fields from the playlist sidebar of ``data``.

    Returns a dict that is empty when no video owner is found, and
    otherwise holds ``uploader``, ``uploader_id`` and ``uploader_url``.
    When several sidebar items carry an owner, the last one wins.
    """
    uploader = {}
    sidebar_items = try_get(
        data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or []
    for sidebar_item in sidebar_items:
        if not isinstance(sidebar_item, dict):
            continue
        secondary_info = sidebar_item.get('playlistSidebarSecondaryInfoRenderer')
        if not isinstance(secondary_info, dict):
            continue
        owner = try_get(
            secondary_info,
            lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
        if not owner:
            continue
        browse_id = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
        base_url = try_get(
            owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
        uploader.update({
            'uploader': owner.get('text'),
            'uploader_id': browse_id,
            'uploader_url': urljoin('https://www.youtube.com/', base_url),
        })
    return uploader
3034
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build the playlist result for a tabbed page.

    Title, description and id come from ``channelMetadataRenderer`` and
    are then overridden by ``playlistMetadataRenderer`` when that one is
    present too.  If the page has neither, the result uses ``item_id`` as
    its id and has no title or description.

    :param item_id: id matched from the URL, used as a fallback.
    :param webpage: unused here; kept for interface compatibility.
    :param data: parsed initial data of the page.
    :param tabs: list of tab dicts; the selected one supplies the entries.
    :param identity_token: forwarded to ``_entries``.
    :raises ExtractorError: when no tab is selected.
    """
    selected_tab = self._extract_selected_tab(tabs)

    # Defaults so that a page without any metadata renderer yields a
    # result instead of failing on unbound local names.
    title = description = None
    playlist_id = item_id

    renderer = try_get(
        data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
    if renderer:
        title = renderer.get('title') or item_id
        tab_title = selected_tab.get('title')
        if tab_title:
            title += ' - %s' % tab_title
        description = renderer.get('description')
        playlist_id = renderer.get('externalId')
    renderer = try_get(
        data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
    if renderer:
        title = renderer.get('title')
        description = None
        playlist_id = item_id
    playlist = self.playlist_result(
        self._entries(selected_tab['content'], identity_token),
        playlist_id=playlist_id, playlist_title=title,
        playlist_description=description)
    playlist.update(self._extract_uploader(data))
    return playlist
73c4ac2c 3059
def _extract_from_playlist(self, item_id, data, playlist):
    """Build the playlist result for a watch page's playlist panel.

    The title is taken from ``playlist`` and falls back to
    ``data['titleText']['simpleText']``; the id is taken from ``playlist``
    and falls back to ``item_id``.
    """
    title = playlist.get('title')
    if not title:
        title = try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
    resolved_id = playlist.get('playlistId') or item_id
    entries = self._playlist_entries(playlist)
    return self.playlist_result(
        entries, playlist_id=resolved_id, playlist_title=title)
c5e8d7af 3067
def _real_extract(self, url):
    """Extract a tab, playlist or (as a fallback) single video from ``url``.

    The host is rewritten to ``www.youtube.com`` before downloading.  For
    a URL naming both a video and a playlist, the ``noplaylist`` option
    decides whether only the video is returned.

    :raises ExtractorError: when the page is neither a tabbed page nor a
        watch page with a playlist, and the URL names no video.
    """
    item_id = self._match_id(url)
    normalized = compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')
    url = compat_urlparse.urlunparse(normalized)

    # Handle both video/playlist URLs
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    playlist_id = query.get('list', [None])[0]
    if video_id and playlist_id:
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    webpage = self._download_webpage(url, item_id)
    identity_token = self._search_regex(
        r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
        'identity token', default=None)
    data = self._extract_yt_initial_data(item_id, webpage)

    tabs = try_get(
        data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
    if tabs:
        return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)

    playlist = try_get(
        data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
    if playlist:
        return self._extract_from_playlist(item_id, data, playlist)

    # Failed to recognize
    if not video_id:
        raise ExtractorError('Unable to recognize tab page')
    # Fallback to video extraction if no playlist alike page is recognized
    return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
c5e8d7af 3099
c5e8d7af 3100
class YoutubePlaylistIE(InfoExtractor):
    # Thin front-end: accepts bare playlist ids and assorted playlist URLs
    # and forwards them to YoutubeTabIE as a canonical /playlist URL.
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us|
                                youtu\.be
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim the URL only when YoutubeTabIE does not already claim it."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        """Redirect to the canonical playlist URL handled by YoutubeTabIE."""
        playlist_id = self._match_id(url)
        # Keep the original query string when there is one; a bare playlist
        # id has none, so synthesise the ``list`` parameter from the id.
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if not query:
            query = {'list': playlist_id}
        target = update_url_query('https://www.youtube.com/playlist', query)
        return self.url_result(
            target, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3199
3200
class YoutubeYtUserIE(InfoExtractor):
    # Shorthand ``ytuser:<name>`` that forwards to the user's channel page.
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Map the shorthand onto the /user/ URL and hand it to YoutubeTabIE."""
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
9558dcec 3213
b05654f0 3214
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a ``.../live`` URL to the live video, else to the channel.

        The page is fetched non-fatally. When it advertises a video page type
        together with a well-formed 11-character video id, that video is
        returned; in every other case the URL without ``/live`` is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        channel_id, base_url = mobj.group('id'), mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3265
3266
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the ``params`` field of the search request;
    # subclasses set it to change the search (e.g. ordering).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent video entries for *query*.

        Pages are requested from the JSON search endpoint one at a time; the
        continuation token of each page is sent with the next request.
        Iteration stops when *n* entries were produced, when a request fails,
        when a page has no recognisable result list, or when no continuation
        token is present.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first getter matches the initial response, the second one
            # the shape of continuation responses.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Capture thousands separators as well: matching digits only
                # would turn a text like "1,234,567 views" into 1.
                view_count = str_to_int(self._search_regex(
                    r'^(\d[\d,]*)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3355
c9ae7b95 3356
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class; only the search key, the listing
    # strings and the request ``params`` value differ.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3362
c9ae7b95 3363
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for the per-account feed extractors.

    Concrete subclasses have to provide ``_FEED_NAME`` (the path component
    below /feed/) and ``_PLAYLIST_TITLE``.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield results for every not-yet-seen video id found in the feed.

        Starts with the ids in *page* and keeps following the "load more"
        link until a portion brings no new ids or no link is left.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        widget_html = page
        portion_html = page
        page_num = 0
        while True:
            page_num += 1
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', portion_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            fresh_ids = [
                candidate for candidate in orderedSet(found)
                if candidate not in seen_ids]
            if not fresh_ids:
                break

            seen_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not load_more:
                break

            more = self._download_json(
                'https://www.youtube.com/%s' % load_more.group('more'),
                self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                headers=self._YOUTUBE_CLIENT_HEADERS)
            portion_html = more['content_html']
            widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and wrap its entries in a playlist result."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3416
3417
class YoutubeWatchLaterIE(InfoExtractor):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/feed/watch_later',
        'only_matching': True,
    }, {
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect to the ``WL`` playlist, which YoutubeTabIE extracts.

        The statements that used to follow this return could never run and
        have been removed.
        """
        return self.url_result(
            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
f459d170 3439
5f6a1245 3440
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed at /feed/recommended, also reachable as ":ytrec" / ":ytrecommended".
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 3446
1ed5b5c9 3447
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # The description names the shorthand exactly as _VALID_URL accepts it
    # (with the leading colon), matching the wording of the sibling feeds.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 3453
1ed5b5c9 3454
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed at /feed/history, also reachable as ":ythistory".
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
1ed5b5c9
JMF
3460
3461
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution URLs that end before any video id, which is
    # what an unquoted "&" on a shell command line leaves behind.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail with an expected error explaining the likely cause."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3509
3510
class YoutubeTruncatedIDIE(InfoExtractor):
    # Matches watch URLs whose ``v`` value is 1 to 10 characters long.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail with an expected error naming the incomplete id."""
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)
8bdd16b4 3526
3527
3528# Old extractors. Are these cases handled elsewhere?
3529
class YoutubeSearchURLIE(YoutubeSearchIE):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    @classmethod
    def _make_valid_url(cls):
        # NOTE(review): the search base class presumably builds its matching
        # pattern from _SEARCH_KEY through this hook; returning _VALID_URL
        # makes this class match result-page URLs instead. Confirm against
        # SearchInfoExtractor.
        return cls._VALID_URL

    def _process_json_dict(self, obj, videos, c):
        """Collect a video object, or record continuation data, from *obj*.

        Objects with a ``videoId`` key are appended to *videos*; otherwise a
        ``nextContinuationData`` value is stored under ``c['continuation']``.
        """
        if "videoId" in obj:
            videos.append(obj)
            return

        if "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]
            return

    def _real_extract(self, url):
        """Run the inherited search for the query found in a results URL.

        The inherited ``_entries(query, n)`` takes no ``max_pages`` argument
        and does not consume a web page, so the query is passed on directly
        with the class-wide result limit.
        """
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        return self.playlist_result(
            self._entries(query, self._MAX_RESULTS), playlist_title=query)
3559
3560
class YoutubeShowIE(InfoExtractor):
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent class with the show's /playlists URL."""
        # NOTE(review): the parent here is InfoExtractor, which presumably
        # does not implement extraction itself -- confirm that this delegate
        # still produces a result.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3578
3579
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id on /my_favorites and delegate to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first ``list=`` value on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')