]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Merge pull request #187 from pukkandan/break-on-existing
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
9833e7a0 39 js_to_json,
94278f72 40 mimetype2ext,
4bb4a188 41 orderedSet,
6310acf5 42 parse_codecs,
b84071c0 43 parse_count,
7c80519c 44 parse_duration,
0cb58b02 45 remove_quotes,
3995d37d 46 remove_start,
cf7e015f 47 smuggle_url,
dbdaaa23 48 str_or_none,
c93d53f5 49 str_to_int,
556dbe7f 50 try_get,
c5e8d7af
PH
51 unescapeHTML,
52 unified_strdate,
cf7e015f 53 unsmuggle_url,
81c2f20b 54 uppercase_escape,
21c340b8 55 url_or_none,
6e6bc8da 56 urlencode_postdata,
c5e8d7af
PH
57)
58
5f6a1245 59
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # The dot is escaped so that only a literal "ytcfg.set(" call is matched
    _YTCFG_DATA_RE = r"ytcfg\.set\(({.*?})\)"

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com, which includes hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list of url_result entries (ie key 'Youtube'), one per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus f_req (JSON-encoded) to url.

            Returns the parsed JSON response, or False on failure (fatal=False).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' before parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        def failure_message(prefix, code, friendly):
            """Build '<prefix>: <reason>' for a failed step.

            The known INCORRECT_ANSWER_ENTERED code is replaced by the
            human-readable text; any other code is reported verbatim.
            """
            reason = friendly if code == 'INCORRECT_ANSWER_ENTERED' else code
            return '%s: %s' % (prefix, reason)

        # The same continue URL is embedded in both the lookup and the
        # challenge request
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised via the helper: '%' binds tighter than the
            # conditional expression, which previously dropped the prefix
            warn(failure_message('Unable to login', login_msg, 'Invalid password'))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        ' (Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(failure_message('Unable to finish TFA', tfa_msg, 'Invalid TFA code'))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of 'query'.

        A missing or None 'query' is replaced by an empty dict, so the caller's
        mapping is never shared with the parent call.
        """
        kwargs['query'] = (kwargs.get('query') or {}).copy()
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData object embedded in webpage.

        Returns the parsed JSON, or None when the assignment is not found
        (parsing itself is non-fatal).
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in (needs a downloader)."""
        if self._downloader is None:
            return
        self._set_language()
        # The login result is deliberately ignored: failures have already
        # been reported as warnings by _login()
        self._login()
c5e8d7af 299
8377574c 300
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for extractors that page through a list of entries.

    Subclasses provide _is_entry(obj) and _process_entries(entries, seen).
    """

    def _find_entries_in_json(self, extracted):
        """Walk the parsed JSON and collect entries and the continuation.

        Returns a tuple (entries, continuation): entries is the list of dicts
        accepted by self._is_entry(), continuation is the last visited dict
        that has a 'continuationCommand' key (None if there is none). Matched
        dicts are not descended into.
        """
        entries = []
        found = {}

        def _real_find(obj):
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if self._is_entry(obj):
                    entries.append(obj)
                    return

                if 'continuationCommand' in obj:
                    found['continuation'] = obj
                    return

                for value in obj.values():
                    _real_find(value)
            # Anything else (None, strings, numbers) holds no entries

        _real_find(extracted)

        return entries, found.get('continuation')

    def _entries(self, page, playlist_id, max_pages=None):
        """Yield the processed entries of page and of its continuation pages.

        page is the HTML of the first page; at most max_pages pages are
        processed when max_pages is given. Iteration stops at the first page
        that produces no entries or that carries no usable continuation.
        """
        seen = []

        # Merge every ytcfg.set({...}) payload found in the page
        yt_conf = {}
        for m in re.finditer(self._YTCFG_DATA_RE, page):
            parsed = self._parse_json(
                m.group(1), playlist_id,
                transform_source=js_to_json, fatal=False)
            if parsed:
                yt_conf.update(parsed)

        data_json = self._parse_json(
            self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'),
            playlist_id)

        if max_pages is None:
            page_numbers = itertools.count(1)
        else:
            page_numbers = range(1, max_pages + 1)

        retries = 3
        for page_num in page_numbers:
            entries, continuation = self._find_entries_in_json(data_json)
            # _process_entries() may be a generator, and a generator object is
            # always truthy; materialise it so the emptiness check below works
            processed = list(self._process_entries(entries, seen))

            if not processed:
                break
            for entry in processed:
                yield entry

            if not continuation or not yt_conf:
                break
            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
            if not continuation_token or not continuation_url:
                break

            for count in range(retries + 1):
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    data_json = self._download_json(
                        'https://www.youtube.com%s' % continuation_url,
                        playlist_id,
                        'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
                        transform_source=uppercase_escape,
                        query={
                            'key': yt_conf.get('INNERTUBE_API_KEY')
                        },
                        data=json.dumps({
                            'context': yt_conf.get('INNERTUBE_CONTEXT'),
                            'continuation': continuation_token
                        }).encode('utf-8'),
                        headers={
                            'Content-Type': 'application/json'
                        }
                    )
                    break
                except ExtractorError as e:
                    # Retry only HTTP 500/503, and only while retries remain
                    if (isinstance(e.cause, compat_HTTPError)
                            and e.cause.code in (500, 503)
                            and count < retries):
                        continue
                    raise

    def _extract_title(self, renderer):
        """Return the title text of renderer, or None.

        The first 'runs' text is preferred; 'simpleText' is the fallback.
        """
        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        if title:
            return title
        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
648e6a1f 395
061a75ed
S
396
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry list extractor whose entries are dicts carrying a 'videoId'."""

    def _is_entry(self, obj):
        """Tell whether obj has a 'videoId' key."""
        return 'videoId' in obj

    def _process_entries(self, entries, seen):
        """Yield one 'Youtube' url_result per distinct video id in entries.

        Renderers without an id or without a title are skipped. When an id
        occurs several times, the first non-empty title is kept. The seen
        argument is accepted but not used here.
        """
        video_ids = []
        titles = []
        for renderer in entries:
            video_id = try_get(renderer, lambda x: x['videoId'])
            title = self._extract_title(renderer)

            # we do not have a videoRenderer or title extraction broke
            if video_id is None or title is None:
                continue

            title = title.strip()

            if video_id in video_ids:
                # Already collected: only fill in a title that is still empty
                position = video_ids.index(video_id)
                if title and not titles[position]:
                    titles[position] = title
            else:
                video_ids.append(video_id)
                titles.append(title)

        for video_id, title in zip(video_ids, titles):
            yield self.url_result(video_id, 'Youtube', video_id, title)
648e6a1f
S
424
425
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry list extractor whose entries are dicts carrying a 'playlistId'."""

    def _is_entry(self, obj):
        """Tell whether obj has a 'playlistId' key."""
        return 'playlistId' in obj

    def _process_entries(self, entries, seen):
        """Yield one 'YoutubePlaylist' url_result per distinct playlist id.

        Ids are de-duplicated with orderedSet; the seen argument is accepted
        but not used here.
        """
        playlist_ids = orderedSet(
            try_get(renderer, lambda x: x['playlistId']) for renderer in entries)
        for playlist_id in playlist_ids:
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download url and return a playlist result built from its entries."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
441
442
360e1ca5 443class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 444 IE_DESC = 'YouTube.com'
cb7dfeea 445 _VALID_URL = r"""(?x)^
c5e8d7af 446 (
edb53e2d 447 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 448 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 449 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 450 (?:www\.)?pwnyoutube\.com/|
8b561bfc 451 (?:www\.)?hooktube\.com/|
f7000f3a 452 (?:www\.)?yourepeat\.com/|
e69ae5b9 453 tube\.majestyc\.net/|
ba036333 454 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 455 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 456 (?:(?:www|no)\.)?invidiou\.sh/|
457 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 458 (?:www\.)?invidious\.kabi\.tk/|
ba036333 459 (?:www\.)?invidious\.13ad\.de/|
791d2e81 460 (?:www\.)?invidious\.mastodon\.host/|
494d664e 461 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 462 (?:www\.)?invidious\.drycat\.fr/|
ba036333 463 (?:www\.)?tube\.poal\.co/|
8ae113ca 464 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 465 (?:www\.)?yewtu\.be/|
494d664e 466 (?:www\.)?yt\.elukerio\.org/|
894b3826 467 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 468 (?:www\.)?invidious\.ggc-project\.de/|
469 (?:www\.)?yt\.maisputain\.ovh/|
470 (?:www\.)?invidious\.13ad\.de/|
471 (?:www\.)?invidious\.toot\.koeln/|
472 (?:www\.)?invidious\.fdn\.fr/|
473 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 474 (?:www\.)?kgg2m7yk5aybusll\.onion/|
475 (?:www\.)?qklhadlycap4cnod\.onion/|
476 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
477 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
478 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
479 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 480 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 481 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 482 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
483 (?:.*?\#/)? # handle anchor (#/) redirect urls
484 (?: # the various things that can precede the ID:
ac7553d0 485 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 486 |(?: # or the v= param in all its forms
f7000f3a 487 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 488 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 489 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
490 v=
491 )
f4b05232 492 ))
cbaed4bb
S
493 |(?:
494 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
495 vid\.plus| # or vid.plus/xxxx
496 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 497 )/
edb53e2d 498 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 499 )
c5e8d7af 500 )? # all until now is optional -> you can pass the naked ID
8963d9c2 501 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
502 (?!.*?\blist=
503 (?:
504 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
505 WL # WL are handled by the watch later IE
506 )
507 )
c5e8d7af 508 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 509 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 510 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
511 _PLAYER_INFO_RE = (
512 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
513 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
514 )
2c62dc26 515 _formats = {
c2d3cb4c 516 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
517 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
518 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
519 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
520 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
521 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
522 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
523 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 524 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 525 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
526 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
527 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
528 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
529 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
530 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 531 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 532 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
533 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 534
535
536 # 3D videos
c2d3cb4c 537 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
538 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
539 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
540 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 541 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
542 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
543 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 544
96fb5605 545 # Apple HTTP Live Streaming
11f12195 546 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 547 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
548 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
549 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
550 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
551 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 552 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
553 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
554
555 # DASH mp4 video
d23028a8
S
556 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
557 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
558 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
559 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
560 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 561 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
562 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
563 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
564 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
565 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
566 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
567 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 568
f6f1fc92 569 # Dash mp4 audio
d23028a8
S
570 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
571 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
572 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
573 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
574 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
575 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
576 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
577
578 # Dash webm
d23028a8
S
579 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
580 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
581 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
582 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
583 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
584 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
585 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
586 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
587 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
588 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
589 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
590 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
591 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
592 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
593 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 594 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
595 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
596 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
597 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
598 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
599 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
600 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
601
602 # Dash webm audio
d23028a8
S
603 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
604 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 605
0857baad 606 # Dash webm audio with opus inside
d23028a8
S
607 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
608 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
609 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 610
ce6b9a2d
PH
611 # RTMP (unnamed)
612 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
613
614 # av01 video only formats sometimes served with "unknown" codecs
615 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
616 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
617 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
618 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 619 }
84da5d84 620 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 621
fd5c4aab
S
622 _GEO_BYPASS = False
623
78caa52a 624 IE_NAME = 'youtube'
2eb88d95
PH
625 _TESTS = [
626 {
2d3d2997 627 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
628 'info_dict': {
629 'id': 'BaW_jenozKc',
630 'ext': 'mp4',
3867038a 631 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
632 'uploader': 'Philipp Hagemeister',
633 'uploader_id': 'phihag',
ec85ded8 634 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
635 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
636 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 637 'upload_date': '20121002',
3867038a 638 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 639 'categories': ['Science & Technology'],
3867038a 640 'tags': ['youtube-dl'],
556dbe7f 641 'duration': 10,
dbdaaa23 642 'view_count': int,
3e7c1224
PH
643 'like_count': int,
644 'dislike_count': int,
7c80519c 645 'start_time': 1,
297a564b 646 'end_time': 9,
2eb88d95 647 }
0e853ca4 648 },
fccd3771 649 {
4bc3a23e
PH
650 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
651 'note': 'Embed-only video (#1746)',
652 'info_dict': {
653 'id': 'yZIXLfi8CZQ',
654 'ext': 'mp4',
655 'upload_date': '20120608',
656 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
657 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
658 'uploader': 'SET India',
94bfcd23 659 'uploader_id': 'setindia',
ec85ded8 660 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 661 'age_limit': 18,
fccd3771
PH
662 }
663 },
11b56058 664 {
2d3d2997 665 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
666 'note': 'Use the first video ID in the URL',
667 'info_dict': {
668 'id': 'BaW_jenozKc',
669 'ext': 'mp4',
3867038a 670 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
671 'uploader': 'Philipp Hagemeister',
672 'uploader_id': 'phihag',
ec85ded8 673 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 674 'upload_date': '20121002',
3867038a 675 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 676 'categories': ['Science & Technology'],
3867038a 677 'tags': ['youtube-dl'],
556dbe7f 678 'duration': 10,
dbdaaa23 679 'view_count': int,
11b56058
PM
680 'like_count': int,
681 'dislike_count': int,
34a7de29
S
682 },
683 'params': {
684 'skip_download': True,
685 },
11b56058 686 },
dd27fd17 687 {
2d3d2997 688 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
689 'note': '256k DASH audio (format 141) via DASH manifest',
690 'info_dict': {
691 'id': 'a9LDPn-MO4I',
692 'ext': 'm4a',
693 'upload_date': '20121002',
694 'uploader_id': '8KVIDEO',
ec85ded8 695 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
696 'description': '',
697 'uploader': '8KVIDEO',
698 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 699 },
4bc3a23e
PH
700 'params': {
701 'youtube_include_dash_manifest': True,
702 'format': '141',
4919603f 703 },
de3c7fe0 704 'skip': 'format 141 not served anymore',
dd27fd17 705 },
aa79ac0c
PH
706 # Controversy video
707 {
708 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
709 'info_dict': {
710 'id': 'T4XJQO3qol8',
711 'ext': 'mp4',
556dbe7f 712 'duration': 219,
aa79ac0c 713 'upload_date': '20100909',
4fe54c12 714 'uploader': 'Amazing Atheist',
aa79ac0c 715 'uploader_id': 'TheAmazingAtheist',
ec85ded8 716 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
717 'title': 'Burning Everyone\'s Koran',
718 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
719 }
c522adb1 720 },
dd2d55f1 721 # Normal age-gate video (embed allowed)
c522adb1 722 {
2d3d2997 723 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
724 'info_dict': {
725 'id': 'HtVdAasjOgU',
726 'ext': 'mp4',
727 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 728 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 729 'duration': 142,
c522adb1
JMF
730 'uploader': 'The Witcher',
731 'uploader_id': 'WitcherGame',
ec85ded8 732 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 733 'upload_date': '20140605',
34952f09 734 'age_limit': 18,
c522adb1
JMF
735 },
736 },
067aa17e 737 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
738 {
739 'url': 'lqQg6PlCWgI',
740 'info_dict': {
741 'id': 'lqQg6PlCWgI',
742 'ext': 'mp4',
556dbe7f 743 'duration': 6085,
90227264 744 'upload_date': '20150827',
cbe2bd91 745 'uploader_id': 'olympic',
ec85ded8 746 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 747 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 748 'uploader': 'Olympic',
cbe2bd91
PH
749 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
750 },
751 'params': {
752 'skip_download': 'requires avconv',
e52a40ab 753 }
cbe2bd91 754 },
6271f1ca
PH
755 # Non-square pixels
756 {
757 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
758 'info_dict': {
759 'id': '_b-2C3KPAM0',
760 'ext': 'mp4',
761 'stretched_ratio': 16 / 9.,
556dbe7f 762 'duration': 85,
6271f1ca
PH
763 'upload_date': '20110310',
764 'uploader_id': 'AllenMeow',
ec85ded8 765 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 766 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 767 'uploader': '孫ᄋᄅ',
6271f1ca
PH
768 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
769 },
06b491eb
S
770 },
771 # url_encoded_fmt_stream_map is empty string
772 {
773 'url': 'qEJwOuvDf7I',
774 'info_dict': {
775 'id': 'qEJwOuvDf7I',
f57b7835 776 'ext': 'webm',
06b491eb
S
777 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
778 'description': '',
779 'upload_date': '20150404',
780 'uploader_id': 'spbelect',
781 'uploader': 'Наблюдатели Петербурга',
782 },
783 'params': {
784 'skip_download': 'requires avconv',
e323cf3f
S
785 },
786 'skip': 'This live event has ended.',
06b491eb 787 },
067aa17e 788 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
789 {
790 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
791 'info_dict': {
792 'id': 'FIl7x6_3R5Y',
eb6793ba 793 'ext': 'webm',
da77d856
S
794 'title': 'md5:7b81415841e02ecd4313668cde88737a',
795 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 796 'duration': 220,
da77d856
S
797 'upload_date': '20150625',
798 'uploader_id': 'dorappi2000',
ec85ded8 799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 800 'uploader': 'dorappi2000',
eb6793ba 801 'formats': 'mincount:31',
da77d856 802 },
eb6793ba 803 'skip': 'not actual anymore',
2ee8f5d8 804 },
8a1a26ce
YCH
805 # DASH manifest with segment_list
806 {
807 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
808 'md5': '8ce563a1d667b599d21064e982ab9e31',
809 'info_dict': {
810 'id': 'CsmdDsKjzN8',
811 'ext': 'mp4',
17ee98e1 812 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
813 'uploader': 'Airtek',
814 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
815 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
816 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
817 },
818 'params': {
819 'youtube_include_dash_manifest': True,
820 'format': '135', # bestvideo
be49068d
S
821 },
822 'skip': 'This live event has ended.',
2ee8f5d8 823 },
cf7e015f
S
824 {
825 # Multifeed videos (multiple cameras), URL is for Main Camera
826 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
827 'info_dict': {
828 'id': 'jqWvoWXjCVs',
829 'title': 'teamPGP: Rocket League Noob Stream',
830 'description': 'md5:dc7872fb300e143831327f1bae3af010',
831 },
832 'playlist': [{
833 'info_dict': {
834 'id': 'jqWvoWXjCVs',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 838 'duration': 7335,
cf7e015f
S
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
ec85ded8 842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 843 'license': 'Standard YouTube License',
cf7e015f
S
844 },
845 }, {
846 'info_dict': {
847 'id': '6h8e8xoXJzg',
848 'ext': 'mp4',
849 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
850 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 851 'duration': 7337,
cf7e015f
S
852 'upload_date': '20150721',
853 'uploader': 'Beer Games Beer',
854 'uploader_id': 'beergamesbeer',
ec85ded8 855 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 856 'license': 'Standard YouTube License',
cf7e015f
S
857 },
858 }, {
859 'info_dict': {
860 'id': 'PUOgX5z9xZw',
861 'ext': 'mp4',
862 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 864 'duration': 7337,
cf7e015f
S
865 'upload_date': '20150721',
866 'uploader': 'Beer Games Beer',
867 'uploader_id': 'beergamesbeer',
ec85ded8 868 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 869 'license': 'Standard YouTube License',
cf7e015f
S
870 },
871 }, {
872 'info_dict': {
873 'id': 'teuwxikvS5k',
874 'ext': 'mp4',
875 'title': 'teamPGP: Rocket League Noob Stream (zim)',
876 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 877 'duration': 7334,
cf7e015f
S
878 'upload_date': '20150721',
879 'uploader': 'Beer Games Beer',
880 'uploader_id': 'beergamesbeer',
ec85ded8 881 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 882 'license': 'Standard YouTube License',
cf7e015f
S
883 },
884 }],
885 'params': {
886 'skip_download': True,
887 },
4fe54c12 888 'skip': 'This video is not available.',
cbaed4bb 889 },
f9f49d87 890 {
067aa17e 891 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
892 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
893 'info_dict': {
894 'id': 'gVfLd0zydlo',
895 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
896 },
897 'playlist_count': 2,
be49068d 898 'skip': 'Not multifeed anymore',
f9f49d87 899 },
cbaed4bb 900 {
2d3d2997 901 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 902 'only_matching': True,
0e49d9a6 903 },
6d4fc66b 904 {
2d3d2997 905 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
906 'only_matching': True,
907 },
0e49d9a6 908 {
067aa17e 909 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 910 # Also tests cut-off URL expansion in video description (see
067aa17e
S
911 # https://github.com/ytdl-org/youtube-dl/issues/1892,
912 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
913 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
914 'info_dict': {
915 'id': 'lsguqyKfVQg',
916 'ext': 'mp4',
917 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 918 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 919 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 920 'duration': 133,
0e49d9a6
LL
921 'upload_date': '20151119',
922 'uploader_id': 'IronSoulElf',
ec85ded8 923 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 924 'uploader': 'IronSoulElf',
eb6793ba
S
925 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
926 'track': 'Dark Walk - Position Music',
927 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 928 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
929 },
930 'params': {
931 'skip_download': True,
932 },
933 },
61f92af1 934 {
067aa17e 935 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
936 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
937 'only_matching': True,
938 },
313dfc45
LL
939 {
940 # Video with yt:stretch=17:0
941 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
942 'info_dict': {
943 'id': 'Q39EVAstoRM',
944 'ext': 'mp4',
945 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
946 'description': 'md5:ee18a25c350637c8faff806845bddee9',
947 'upload_date': '20151107',
948 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
949 'uploader': 'CH GAMER DROID',
950 },
951 'params': {
952 'skip_download': True,
953 },
be49068d 954 'skip': 'This video does not exist.',
313dfc45 955 },
7caf9830
S
956 {
957 # Video licensed under Creative Commons
958 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
959 'info_dict': {
960 'id': 'M4gD1WSo5mA',
961 'ext': 'mp4',
962 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
963 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 964 'duration': 721,
7caf9830
S
965 'upload_date': '20150127',
966 'uploader_id': 'BerkmanCenter',
ec85ded8 967 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 968 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
969 'license': 'Creative Commons Attribution license (reuse allowed)',
970 },
971 'params': {
972 'skip_download': True,
973 },
974 },
fd050249
S
975 {
976 # Channel-like uploader_url
977 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
978 'info_dict': {
979 'id': 'eQcmzGIKrzg',
980 'ext': 'mp4',
981 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
982 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 983 'duration': 4060,
fd050249 984 'upload_date': '20151119',
eb6793ba 985 'uploader': 'Bernie Sanders',
fd050249 986 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
988 'license': 'Creative Commons Attribution license (reuse allowed)',
989 },
990 'params': {
991 'skip_download': True,
992 },
993 },
040ac686
S
994 {
995 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
996 'only_matching': True,
7f29cf54
S
997 },
998 {
067aa17e 999 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
1000 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1001 'only_matching': True,
6496ccb4
S
1002 },
1003 {
1004 # Rental video preview
1005 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1006 'info_dict': {
1007 'id': 'uGpuVWrhIzE',
1008 'ext': 'mp4',
1009 'title': 'Piku - Trailer',
1010 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1011 'upload_date': '20150811',
1012 'uploader': 'FlixMatrix',
1013 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1014 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1015 'license': 'Standard YouTube License',
1016 },
1017 'params': {
1018 'skip_download': True,
1019 },
eb6793ba 1020 'skip': 'This video is not available.',
022a5d66 1021 },
12afdc2a
S
1022 {
1023 # YouTube Red video with episode data
1024 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1025 'info_dict': {
1026 'id': 'iqKdEhx-dD4',
1027 'ext': 'mp4',
1028 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 1029 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 1030 'duration': 2085,
12afdc2a
S
1031 'upload_date': '20170118',
1032 'uploader': 'Vsauce',
1033 'uploader_id': 'Vsauce',
1034 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
1035 'series': 'Mind Field',
1036 'season_number': 1,
1037 'episode_number': 1,
1038 },
1039 'params': {
1040 'skip_download': True,
1041 },
1042 'expected_warnings': [
1043 'Skipping DASH manifest',
1044 ],
1045 },
c7121fa7
S
1046 {
1047 # The following content has been identified by the YouTube community
1048 # as inappropriate or offensive to some audiences.
1049 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1050 'info_dict': {
1051 'id': '6SJNVb0GnPI',
1052 'ext': 'mp4',
1053 'title': 'Race Differences in Intelligence',
1054 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1055 'duration': 965,
1056 'upload_date': '20140124',
1057 'uploader': 'New Century Foundation',
1058 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1059 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
022a5d66
S
1065 {
1066 # itag 212
1067 'url': '1t24XAntNCY',
1068 'only_matching': True,
fd5c4aab
S
1069 },
1070 {
1071 # geo restricted to JP
1072 'url': 'sJL6WA-aGkQ',
1073 'only_matching': True,
1074 },
d0ba5587
S
1075 {
1076 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1077 'only_matching': True,
1078 },
cd5a74a2
S
1079 {
1080 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1081 'only_matching': True,
1082 },
825cd268
RA
1083 {
1084 # DRM protected
1085 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1086 'only_matching': True,
4fe54c12
S
1087 },
1088 {
1089 # Video with unsupported adaptive stream type formats
1090 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1091 'info_dict': {
1092 'id': 'Z4Vy8R84T1U',
1093 'ext': 'mp4',
1094 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1095 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1096 'duration': 433,
1097 'upload_date': '20130923',
1098 'uploader': 'Amelia Putri Harwita',
1099 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1100 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1101 'formats': 'maxcount:10',
1102 },
1103 'params': {
1104 'skip_download': True,
1105 'youtube_include_dash_manifest': False,
1106 },
5429d6a9 1107 'skip': 'not actual anymore',
5caabd3c 1108 },
1109 {
822b9d9c 1110 # Youtube Music Auto-generated description
5caabd3c 1111 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1112 'info_dict': {
1113 'id': 'MgNrAu2pzNs',
1114 'ext': 'mp4',
1115 'title': 'Voyeur Girl',
1116 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1117 'upload_date': '20190312',
5429d6a9
S
1118 'uploader': 'Stephen - Topic',
1119 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1120 'artist': 'Stephen',
1121 'track': 'Voyeur Girl',
1122 'album': 'it\'s too much love to know my dear',
1123 'release_date': '20190313',
1124 'release_year': 2019,
1125 },
1126 'params': {
1127 'skip_download': True,
1128 },
1129 },
1130 {
822b9d9c 1131 # Youtube Music Auto-generated description
5caabd3c 1132 # Retrieve 'artist' field from 'Artist:' in video description
1133 # when it is present on youtube music video
5caabd3c 1134 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1135 'info_dict': {
1136 'id': 'k0jLE7tTwjY',
1137 'ext': 'mp4',
1138 'title': 'Latch Feat. Sam Smith',
1139 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1140 'upload_date': '20150110',
1141 'uploader': 'Various Artists - Topic',
1142 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1143 'artist': 'Disclosure',
1144 'track': 'Latch Feat. Sam Smith',
1145 'album': 'Latch Featuring Sam Smith',
1146 'release_date': '20121008',
1147 'release_year': 2012,
1148 },
1149 'params': {
1150 'skip_download': True,
1151 },
1152 },
1153 {
822b9d9c 1154 # Youtube Music Auto-generated description
5caabd3c 1155 # handle multiple artists on youtube music video
1156 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1157 'info_dict': {
1158 'id': '74qn0eJSjpA',
1159 'ext': 'mp4',
1160 'title': 'Eastside',
1161 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1162 'upload_date': '20180710',
1163 'uploader': 'Benny Blanco - Topic',
1164 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1165 'artist': 'benny blanco, Halsey, Khalid',
1166 'track': 'Eastside',
1167 'album': 'Eastside',
1168 'release_date': '20180713',
1169 'release_year': 2018,
1170 },
1171 'params': {
1172 'skip_download': True,
1173 },
1174 },
1175 {
822b9d9c 1176 # Youtube Music Auto-generated description
5caabd3c 1177 # handle youtube music video with release_year and no release_date
1178 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1179 'info_dict': {
1180 'id': '-hcAI0g-f5M',
1181 'ext': 'mp4',
1182 'title': 'Put It On Me',
5429d6a9 1183 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1184 'upload_date': '20180426',
1185 'uploader': 'Matt Maeson - Topic',
1186 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1187 'artist': 'Matt Maeson',
1188 'track': 'Put It On Me',
1189 'album': 'The Hearse',
1190 'release_date': None,
1191 'release_year': 2018,
1192 },
1193 'params': {
1194 'skip_download': True,
1195 },
1196 },
66b48727
RA
1197 {
1198 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1199 'only_matching': True,
1200 },
011e75e6
S
1201 {
1202 # invalid -> valid video id redirection
1203 'url': 'DJztXj2GPfl',
1204 'info_dict': {
1205 'id': 'DJztXj2GPfk',
1206 'ext': 'mp4',
1207 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1208 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1209 'upload_date': '20090125',
1210 'uploader': 'Prochorowka',
1211 'uploader_id': 'Prochorowka',
1212 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1213 'artist': 'Panjabi MC',
1214 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1215 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1216 },
1217 'params': {
1218 'skip_download': True,
1219 },
ea74e00b
DP
1220 },
1221 {
1222 # empty description results in an empty string
1223 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1224 'info_dict': {
1225 'id': 'x41yOUIvK2k',
1226 'ext': 'mp4',
1227 'title': 'IMG 3456',
1228 'description': '',
1229 'upload_date': '20170613',
1230 'uploader_id': 'ElevageOrVert',
1231 'uploader': 'ElevageOrVert',
1232 },
1233 'params': {
1234 'skip_download': True,
1235 },
1236 },
2eb88d95
PH
1237 ]
1238
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor, forwarding all arguments to the parent class."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Deciphering functions keyed by (player_url, signature cache id);
    # populated on demand by _decrypt_signature.
    self._player_cache = dict()
e0df6211 1242
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1246
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1250
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1254
def report_rtmp_download(self):
    """Tell the user that the download will go over the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1258
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Describe the shape of a signature as dot-joined part lengths.

    The signature is split on '.', and the length of every part is
    rendered as a string; the lengths are joined with '.' again.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53 1262
e40c758c
S
@classmethod
def _extract_player_info(cls, player_url):
    """Return the (ext, id) pair identifying the player at player_url.

    The patterns in cls._PLAYER_INFO_RE are tried in order and the first
    one that matches supplies the 'ext' and 'id' groups.

    Raises ExtractorError when no pattern matches.
    """
    # Generator keeps the search lazy: later patterns are not evaluated
    # once an earlier one has matched.
    attempts = (re.search(pattern, player_url) for pattern in cls._PLAYER_INFO_RE)
    id_m = next((found for found in attempts if found), None)
    if id_m is None:
        raise ExtractorError('Cannot identify player %r' % player_url)
    return id_m.group('ext'), id_m.group('id')
1272
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like example_sig.

    The function is first looked up in the downloader cache under the
    'youtube-sigfuncs' section. On a miss the player is downloaded, the
    deciphering routine is extracted from it, and the resulting index
    permutation is stored in the cache before the routine is returned.

    Raises ExtractorError when the cache id is not a plain name or when
    the player type is neither 'js' nor 'swf'.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id is handed to the cache as a key, so it must not carry any path
    # component. An explicit raise is used because `assert` is stripped
    # when Python runs with -O.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature function id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices: output char k is s[spec[k]]
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Previously an `assert False`; under -O that left `res` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the routine on a string whose k-th character has code point k, so
    # the output directly reveals which input index ends up at each position.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1312
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the deciphering function func.

    func is probed with a string whose k-th character has code point k;
    the resulting index permutation is rendered as a sum of indexing and
    slicing expressions on `s` and shown via to_screen.
    """
    def render_slice(first, last, stride):
        # Render the run first..last (inclusive) walking by stride as s[a:b:c]
        lower = str(first) if first != 0 else ''
        stop = last + stride
        upper = (':%d' % stop) if stop >= 0 else ':'
        stride_part = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, stride_part)

    def gen_sig_code(idxs):
        stride = None
        # Squelch pyflakes warnings - run_start will be set when stride is set
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if stride is not None:
                # Inside a run: keep going while the stride holds,
                # otherwise emit the finished run
                if delta != stride:
                    yield render_slice(run_start, prev, stride)
                    stride = None
            elif delta in [-1, 1]:
                # Two neighbouring indices open a new run
                stride = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield render_slice(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(permutation))
    part_lengths = ', '.join(
        compat_str(len(piece)) for piece in example_sig.split('.'))
    signature_id_tuple = '(%s)' % (part_lengths)
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1351
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature deciphering callable from JS player code.

    The name of the deciphering function is located with the regexes
    below (tried in order), the function is extracted with JSInterpreter,
    and a wrapper taking the signature string is returned.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    decipher = JSInterpreter(jscode).extract_function(funcname)
    # The extracted function takes its arguments as a list
    return lambda s: decipher([s])
1372
def _parse_sig_swf(self, file_contents):
    """Build a signature deciphering callable from SWF player contents.

    Extracts the 'decipher' function of the 'SignatureDecipher' class with
    SWFInterpreter and returns a wrapper taking the signature string.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')
    # The extracted function takes its arguments as a list
    return lambda s: decipher([s])
1379
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The deciphering function for the given player and signature shape is
    taken from self._player_cache, or extracted and cached on first use.

    Raises ExtractorError when player_url is None or when anything goes
    wrong while obtaining or applying the deciphering function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Turn protocol-relative and site-relative player URLs into absolute ones
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as err:
        # Include the full traceback in the message to ease bug reports
        details = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + details, cause=err)
e0df6211 1406
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return a dict mapping language codes to lists of subtitle formats.

    The track list is downloaded from the timedtext endpoint; for every
    language (first track wins) one entry per format in
    self._SUBTITLE_FORMATS is produced. When has_live_chat_replay is true
    an additional 'live_chat' entry is added. An empty dict is returned,
    after a warning, when the list cannot be downloaded or is empty.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Only the first track of each language is used
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1446
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Return the player config parsed from the webpage, or None.

    None is returned when no pattern matches; JSON parsing is non-fatal.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
        r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1465
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from the initial data of a watch page.

    Walks the metadata rows of every videoSecondaryInfoRenderer found in
    yt_initial and maps the 'Album', 'Artist' and 'Song' rows to the
    'album', 'artist' and 'track' keys. A row whose key is already present
    in the current track starts a new track.

    Returns a list of dicts, one per track; empty when nothing is found.
    """
    music_metadata = []
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata

    for content in contents:
        music_track = {}
        if not isinstance(content, dict):
            continue
        secondary_info = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(secondary_info, dict):
            continue
        rows = try_get(secondary_info, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue
        for row in rows:
            row_renderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(row_renderer, dict):
                continue
            key = try_get(row_renderer, lambda x: x['title']['simpleText'])
            value = (
                try_get(row_renderer, lambda x: x['contents'][0]['simpleText'])
                or try_get(row_renderer, lambda x: x['contents'][0]['runs'][0]['text']))
            # compat_str rather than str: on Python 2 parsed JSON strings are
            # unicode, which an exact `type(...) is str` check rejects.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            if key in key_map:
                if key_map[key] in music_track:
                    # we've started on a new track
                    music_metadata.append(music_track)
                    music_track = {}
                music_track[key_map[key]] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1503
360e1ca5 1504 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
1505 """We need the webpage for getting the captions url, pass it as an
1506 argument to speed up the process."""
69ea8ca4 1507 self.to_screen('%s: Looking for automatic captions' % video_id)
a72778d3 1508 player_config = self._get_ytplayer_config(video_id, webpage)
78caa52a 1509 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
a72778d3 1510 if not player_config:
de7f3446
JMF
1511 self._downloader.report_warning(err_msg)
1512 return {}
de7f3446 1513 try:
59c5fa91
PO
1514 if "args" in player_config and "ttsurl" in player_config["args"]:
1515 args = player_config['args']
1516 caption_url = args['ttsurl']
b78b292f 1517 timestamp = args['timestamp']
59c5fa91 1518
b78b292f 1519 # We get the available subtitles
15707c7e 1520 list_params = compat_urllib_parse_urlencode({
b78b292f
S
1521 'type': 'list',
1522 'tlangs': 1,
1523 'asrs': 1,
1524 })
1525 list_url = caption_url + '&' + list_params
1526 caption_list = self._download_xml(list_url, video_id)
1527 original_lang_node = caption_list.find('track')
1528 if original_lang_node is None:
1529 self._downloader.report_warning('Video doesn\'t have automatic captions')
1530 return {}
1531 original_lang = original_lang_node.attrib['lang_code']
1532 caption_kind = original_lang_node.attrib.get('kind', '')
1533
1534 sub_lang_list = {}
1535 for lang_node in caption_list.findall('target'):
1536 sub_lang = lang_node.attrib['lang_code']
1537 sub_formats = []
1538 for ext in self._SUBTITLE_FORMATS:
15707c7e 1539 params = compat_urllib_parse_urlencode({
b78b292f
S
1540 'lang': original_lang,
1541 'tlang': sub_lang,
1542 'fmt': ext,
1543 'ts': timestamp,
1544 'kind': caption_kind,
1545 })
1546 sub_formats.append({
1547 'url': caption_url + '&' + params,
1548 'ext': ext,
1549 })
1550 sub_lang_list[sub_lang] = sub_formats
1551 return sub_lang_list
1552
ddbb4c5c
S
1553 def make_captions(sub_url, sub_langs):
1554 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1555 caption_qs = compat_parse_qs(parsed_sub_url.query)
1556 captions = {}
1557 for sub_lang in sub_langs:
1558 sub_formats = []
1559 for ext in self._SUBTITLE_FORMATS:
1560 caption_qs.update({
1561 'tlang': [sub_lang],
1562 'fmt': [ext],
1563 })
1564 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1565 query=compat_urllib_parse_urlencode(caption_qs, True)))
1566 sub_formats.append({
1567 'url': sub_url,
1568 'ext': ext,
1569 })
1570 captions[sub_lang] = sub_formats
1571 return captions
1572
1573 # New captions format as of 22.06.2017
59c5fa91
PO
1574 if "args" in player_config:
1575 player_response = player_config["args"].get('player_response')
1576 else:
1577 # New player system (ytInitialPlayerResponse) as of October 2020
1578 player_response = player_config
1579
1580 if player_response:
1581 if isinstance(player_response, compat_str):
1582 player_response = self._parse_json(
1583 player_response, video_id, fatal=False)
1584
1585 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
1586 caption_tracks = renderer['captionTracks']
1587 for caption_track in caption_tracks:
1588 if 'kind' not in caption_track:
1589 # not an automatic transcription
1590 continue
1591 base_url = caption_track['baseUrl']
1592 sub_lang_list = []
1593 for lang in renderer['translationLanguages']:
1594 lang_code = lang.get('languageCode')
1595 if lang_code:
1596 sub_lang_list.append(lang_code)
1597 return make_captions(base_url, sub_lang_list)
1598
1599 self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
1600 return {}
1601
1602 if "args" in player_config:
1603 args = player_config["args"]
1604
1605 # Some videos don't provide ttsurl but rather caption_tracks and
1606 # caption_translation_languages (e.g. 20LmZk1hakA)
1607 # Does not used anymore as of 22.06.2017
1608 caption_tracks = args['caption_tracks']
1609 caption_translation_languages = args['caption_translation_languages']
1610 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
1611 sub_lang_list = []
1612 for lang in caption_translation_languages.split(','):
1613 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1614 sub_lang = lang_qs.get('lc', [None])[0]
1615 if sub_lang:
1616 sub_lang_list.append(sub_lang)
1617 return make_captions(caption_url, sub_lang_list)
de7f3446
JMF
1618 # An extractor error can be raise by the download process if there are
1619 # no automatic captions but there are subtitles
ddbb4c5c 1620 except (KeyError, IndexError, ExtractorError):
de7f3446
JMF
1621 self._downloader.report_warning(err_msg)
1622 return {}
1623
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the video's playback-stats URL ('Marking watched').

    The base URL is taken from the player response and, failing that,
    from the legacy video info.  Nothing happens when neither provides a
    usable URL.  The request itself is non-fatal.
    """
    base_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not base_url:
        base_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(base_url)
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    query['ver'] = ['2']
    query['cpn'] = [''.join(cpn_chars)]

    encoded_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=encoded_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1649
66c9fa36
S
1650 @staticmethod
1651 def _extract_urls(webpage):
1652 # Embedded YouTube player
1653 entries = [
1654 unescapeHTML(mobj.group('url'))
1655 for mobj in re.finditer(r'''(?x)
1656 (?:
1657 <iframe[^>]+?src=|
1658 data-video-url=|
1659 <embed[^>]+?src=|
1660 embedSWF\(?:\s*|
1661 <object[^>]+data=|
1662 new\s+SWFObject\(
1663 )
1664 (["\'])
1665 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
f2332f18 1666 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
66c9fa36
S
1667 \1''', webpage)]
1668
1669 # lazyYT YouTube embed
1670 entries.extend(list(map(
1671 unescapeHTML,
1672 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1673
1674 # Wordpress "YouTube Video Importer" plugin
1675 matches = re.findall(r'''(?x)<div[^>]+
1676 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1677 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1678 entries.extend(m[-1] for m in matches)
1679
1680 return entries
1681
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by ``YoutubeIE._extract_urls``, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
1686
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``cls._VALID_URL``.

    Raises ExtractorError when the URL does not match the pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1694
84213ea8
S
1695 def _extract_chapters_from_json(self, webpage, video_id, duration):
1696 if not webpage:
1697 return
edd83104 1698 initial_data = self._parse_json(
84213ea8 1699 self._search_regex(
edd83104 1700 r'window\["ytInitialData"\] = (.+);\n', webpage,
84213ea8
S
1701 'player args', default='{}'),
1702 video_id, fatal=False)
edd83104 1703 if not initial_data or not isinstance(initial_data, dict):
84213ea8
S
1704 return
1705 chapters_list = try_get(
edd83104 1706 initial_data,
84213ea8
S
1707 lambda x: x['playerOverlays']
1708 ['playerOverlayRenderer']
1709 ['decoratedPlayerBarRenderer']
1710 ['decoratedPlayerBarRenderer']
1711 ['playerBar']
1712 ['chapteredPlayerBarRenderer']
1713 ['chapters'],
1714 list)
1715 if not chapters_list:
1716 return
1717
1718 def chapter_time(chapter):
1719 return float_or_none(
1720 try_get(
1721 chapter,
1722 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1723 int),
1724 scale=1000)
1725 chapters = []
1726 for next_num, chapter in enumerate(chapters_list, start=1):
1727 start_time = chapter_time(chapter)
1728 if start_time is None:
1729 continue
1730 end_time = (chapter_time(chapters_list[next_num])
1731 if next_num < len(chapters_list) else duration)
1732 if end_time is None:
1733 continue
1734 title = try_get(
1735 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1736 compat_str)
1737 chapters.append({
1738 'start_time': start_time,
1739 'end_time': end_time,
1740 'title': title,
1741 })
1742 return chapters
1743
9cafc3fd 1744 @staticmethod
84213ea8 1745 def _extract_chapters_from_description(description, duration):
9cafc3fd
S
1746 if not description:
1747 return None
1748 chapter_lines = re.findall(
1749 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1750 description)
1751 if not chapter_lines:
1752 return None
1753 chapters = []
1754 for next_num, (chapter_line, time_point) in enumerate(
1755 chapter_lines, start=1):
1756 start_time = parse_duration(time_point)
1757 if start_time is None:
1758 continue
39d4c1be
S
1759 if start_time > duration:
1760 break
9cafc3fd
S
1761 end_time = (duration if next_num == len(chapter_lines)
1762 else parse_duration(chapter_lines[next_num][1]))
1763 if end_time is None:
1764 continue
39d4c1be
S
1765 if end_time > duration:
1766 end_time = duration
1767 if start_time > end_time:
1768 break
9cafc3fd
S
1769 chapter_title = re.sub(
1770 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1771 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1772 chapters.append({
1773 'start_time': start_time,
1774 'end_time': end_time,
1775 'title': chapter_title,
1776 })
1777 return chapters
1778
84213ea8
S
1779 def _extract_chapters(self, webpage, description, video_id, duration):
1780 return (self._extract_chapters_from_json(webpage, video_id, duration)
1781 or self._extract_chapters_from_description(description, duration))
1782
c5e8d7af 1783 def _real_extract(self, url):
cf7e015f
S
1784 url, smuggled_data = unsmuggle_url(url, {})
1785
7e8c0af0 1786 proto = (
78caa52a
PH
1787 'http' if self._downloader.params.get('prefer_insecure', False)
1788 else 'https')
7e8c0af0 1789
7c80519c 1790 start_time = None
297a564b 1791 end_time = None
7c80519c
JMF
1792 parsed_url = compat_urllib_parse_urlparse(url)
1793 for component in [parsed_url.fragment, parsed_url.query]:
1794 query = compat_parse_qs(component)
297a564b 1795 if start_time is None and 't' in query:
7c80519c 1796 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1797 if start_time is None and 'start' in query:
1798 start_time = parse_duration(query['start'][0])
297a564b
JMF
1799 if end_time is None and 'end' in query:
1800 end_time = parse_duration(query['end'][0])
7c80519c 1801
c5e8d7af
PH
1802 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1803 mobj = re.search(self._NEXT_URL_RE, url)
1804 if mobj:
7fd002c0 1805 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1806 video_id = self.extract_id(url)
c5e8d7af
PH
1807
1808 # Get video webpage
aa79ac0c 1809 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1810 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1811
1812 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1813 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1814
1815 # Attempt to extract SWF player URL
e0df6211 1816 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1817 if mobj is not None:
1818 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1819 else:
1820 player_url = None
1821
d8d24a92
S
1822 dash_mpds = []
1823
1824 def add_dash_mpd(video_info):
1825 dash_mpd = video_info.get('dashmpd')
1826 if dash_mpd and dash_mpd[0] not in dash_mpds:
1827 dash_mpds.append(dash_mpd[0])
1828
561b456e
S
1829 def add_dash_mpd_pr(pl_response):
1830 dash_mpd = url_or_none(try_get(
1831 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1832 compat_str))
1833 if dash_mpd and dash_mpd not in dash_mpds:
1834 dash_mpds.append(dash_mpd)
1835
c7121fa7
S
1836 is_live = None
1837 view_count = None
1838
1839 def extract_view_count(v_info):
1840 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1841
c2d125d9
S
1842 def extract_player_response(player_response, video_id):
1843 pl_response = str_or_none(player_response)
1844 if not pl_response:
1845 return
1846 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1847 if isinstance(pl_response, dict):
1848 add_dash_mpd_pr(pl_response)
1849 return pl_response
1850
fb2c9277
U
1851 def extract_embedded_config(embed_webpage, video_id):
1852 embedded_config = self._search_regex(
1853 r'setConfig\(({.*})\);',
1854 embed_webpage, 'ytInitialData', default=None)
1855 if embedded_config:
1856 return embedded_config
1857
dbdaaa23
S
1858 player_response = {}
1859
c5e8d7af 1860 # Get video info
43ebf77d 1861 video_info = {}
6449cd80 1862 embed_webpage = None
39e7107d
U
1863 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1864 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1865 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1866 age_gate = True
1867 # We simulate the access to the video from www.youtube.com/v/{video_id}
1868 # this can be viewed without login into Youtube
beb95e77
CL
1869 url = proto + '://www.youtube.com/embed/%s' % video_id
1870 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1871 ext = extract_embedded_config(embed_webpage, video_id)
1872 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1873 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1874 if not playable_in_embed:
1875 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1876 playable_in_embed = ''
1877 else:
1878 playable_in_embed = playable_in_embed.group('playableinEmbed')
1879 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1880 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1881 if playable_in_embed == 'false':
c73baf23
U
1882 '''
1883 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1884 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1885 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1886 '''
1887 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1888 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1889 age_gate = False
1890 # Try looking directly into the video webpage
1891 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1892 if ytplayer_config:
59c5fa91
PO
1893 args = ytplayer_config.get("args")
1894 if args is not None:
1895 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1896 # Convert to the same format returned by compat_parse_qs
1897 video_info = dict((k, [v]) for k, v in args.items())
1898 add_dash_mpd(video_info)
1899 # Rental video is not rented but preview is available (e.g.
1900 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1901 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1902 if not video_info and args.get('ypc_vid'):
1903 return self.url_result(
1904 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1905 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1906 is_live = True
1907 if not player_response:
1908 player_response = extract_player_response(args.get('player_response'), video_id)
1909 elif not player_response:
1910 player_response = ytplayer_config
4bb9c880
U
1911 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1912 add_dash_mpd_pr(player_response)
9d9314cb
U
1913 else:
1914 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1915 else:
1916 data = compat_urllib_parse_urlencode({
1917 'video_id': video_id,
1918 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1919 'sts': self._search_regex(
1920 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1921 })
1922 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1923 try:
1924 video_info_webpage = self._download_webpage(
1925 video_info_url, video_id,
1926 note='Refetching age-gated info webpage',
1927 errnote='unable to download video info webpage')
1928 except ExtractorError:
1929 video_info_webpage = None
1930 if video_info_webpage:
1931 video_info = compat_parse_qs(video_info_webpage)
1932 pl_response = video_info.get('player_response', [None])[0]
1933 player_response = extract_player_response(pl_response, video_id)
1934 add_dash_mpd(video_info)
1935 view_count = extract_view_count(video_info)
c108eb73
JMF
1936 else:
1937 age_gate = False
d8d24a92 1938 # Try looking directly into the video webpage
a72778d3 1939 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
59c5fa91
PO
1940 args = ytplayer_config.get("args")
1941 if args is not None:
4c76aa06 1942 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1943 # Convert to the same format returned by compat_parse_qs
1944 video_info = dict((k, [v]) for k, v in args.items())
1945 add_dash_mpd(video_info)
6496ccb4
S
1946 # Rental video is not rented but preview is available (e.g.
1947 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1948 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1949 if not video_info and args.get('ypc_vid'):
1950 return self.url_result(
1951 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1952 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1953 is_live = True
dbdaaa23 1954 if not player_response:
c2d125d9 1955 player_response = extract_player_response(args.get('player_response'), video_id)
59c5fa91
PO
1956 elif not player_response:
1957 player_response = ytplayer_config
0a3cf9ad 1958 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1959 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1960
1961 def extract_unavailable_message():
0add33ab
S
1962 messages = []
1963 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1964 msg = self._html_search_regex(
1965 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1966 video_webpage, 'unavailable %s' % kind, default=None)
1967 if msg:
1968 messages.append(msg)
1969 if messages:
1970 return '\n'.join(messages)
bbb7c3f7 1971
f93abcf1 1972 if not video_info and not player_response:
15be3eb5
RA
1973 unavailable_message = extract_unavailable_message()
1974 if not unavailable_message:
1975 unavailable_message = 'Unable to extract video data'
1976 raise ExtractorError(
1977 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1978
f93abcf1
S
1979 if not isinstance(video_info, dict):
1980 video_info = {}
1981
dbdaaa23
S
1982 video_details = try_get(
1983 player_response, lambda x: x['videoDetails'], dict) or {}
1984
37357d21
S
1985 microformat = try_get(
1986 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1987
8dbf751a
RA
1988 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1989 if not video_title:
cf7e015f
S
1990 self._downloader.report_warning('Unable to extract video title')
1991 video_title = '_'
1992
9cafc3fd 1993 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1994 if video_description:
fa4bc6e7
RA
1995
1996 def replace_url(m):
1997 redir_url = compat_urlparse.urljoin(url, m.group(1))
1998 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1999 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
2000 qs = compat_parse_qs(parsed_redir_url.query)
2001 q = qs.get('q')
2002 if q and q[0]:
2003 return q[0]
2004 return redir_url
2005
9cafc3fd 2006 description_original = video_description = re.sub(r'''(?x)
cf7e015f 2007 <a\s+
25cb7a0e 2008 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 2009 (?:title|href)="([^"]+)"\s+
25cb7a0e 2010 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 2011 class="[^"]*"[^>]*>
23f13e97 2012 [^<]+\.{3}\s*
cf7e015f 2013 </a>
fa4bc6e7 2014 ''', replace_url, video_description)
cf7e015f
S
2015 video_description = clean_html(video_description)
2016 else:
ea74e00b
DP
2017 video_description = video_details.get('shortDescription')
2018 if video_description is None:
2019 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 2020
8fe10494 2021 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 2022 if not self._downloader.params.get('noplaylist'):
8fe10494
S
2023 multifeed_metadata_list = try_get(
2024 player_response,
2025 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2026 compat_str) or try_get(
2027 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2028 if multifeed_metadata_list:
2029 entries = []
2030 feed_ids = []
2031 for feed in multifeed_metadata_list.split(','):
2032 # Unquote should take place before split on comma (,) since textual
2033 # fields may contain comma as well (see
067aa17e 2034 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 2035 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
2036
2037 def feed_entry(name):
2038 return try_get(feed_data, lambda x: x[name][0], compat_str)
2039
2040 feed_id = feed_entry('id')
2041 if not feed_id:
2042 continue
2043 feed_title = feed_entry('title')
2044 title = video_title
2045 if feed_title:
2046 title += ' (%s)' % feed_title
8fe10494
S
2047 entries.append({
2048 '_type': 'url_transparent',
2049 'ie_key': 'Youtube',
2050 'url': smuggle_url(
2051 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2052 {'force_singlefeed': True}),
6b09401b 2053 'title': title,
8fe10494 2054 })
6b09401b 2055 feed_ids.append(feed_id)
8fe10494
S
2056 self.to_screen(
2057 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2058 % (', '.join(feed_ids), video_id))
2059 return self.playlist_result(entries, video_id, video_title, video_description)
2060 else:
2061 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 2062
c7121fa7 2063 if view_count is None:
1c9c8de2 2064 view_count = extract_view_count(video_info)
dbdaaa23
S
2065 if view_count is None and video_details:
2066 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
2067 if view_count is None and microformat:
2068 view_count = int_or_none(microformat.get('viewCount'))
1d699755 2069
27019dbb 2070 if is_live is None:
898238e9 2071 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 2072
321bf820 2073 has_live_chat_replay = False
f0f76a33 2074 if not is_live:
321bf820 2075 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2076 try:
2077 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2078 has_live_chat_replay = True
f0f76a33 2079 except (KeyError, IndexError, TypeError):
321bf820 2080 pass
2081
c5e8d7af
PH
2082 # Check for "rental" videos
2083 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 2084 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 2085
c63ca0ee
S
2086 def _extract_filesize(media_url):
2087 return int_or_none(self._search_regex(
2088 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2089
bf1317d2
S
2090 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2091 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2092
c5e8d7af
PH
2093 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2094 self.report_rtmp_download()
dd27fd17
PH
2095 formats = [{
2096 'format_id': '_rtmp',
2097 'protocol': 'rtmp',
2098 'url': video_info['conn'][0],
2099 'player_url': player_url,
2100 }]
bf1317d2 2101 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 2102 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 2103 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 2104 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 2105 formats = []
3318832e 2106 formats_spec = {}
82156fdb 2107 fmt_list = video_info.get('fmt_list', [''])[0]
2108 if fmt_list:
2109 for fmt in fmt_list.split(','):
2110 spec = fmt.split('/')
3318832e 2111 if len(spec) > 1:
2112 width_height = spec[1].split('x')
2113 if len(width_height) == 2:
2114 formats_spec[spec[0]] = {
2115 'resolution': spec[1],
2116 'width': int_or_none(width_height[0]),
2117 'height': int_or_none(width_height[1]),
2118 }
bf1317d2
S
2119 for fmt in streaming_formats:
2120 itag = str_or_none(fmt.get('itag'))
2121 if not itag:
201e9eaa 2122 continue
bf1317d2
S
2123 quality = fmt.get('quality')
2124 quality_label = fmt.get('qualityLabel') or quality
2125 formats_spec[itag] = {
2126 'asr': int_or_none(fmt.get('audioSampleRate')),
2127 'filesize': int_or_none(fmt.get('contentLength')),
2128 'format_note': quality_label,
2129 'fps': int_or_none(fmt.get('fps')),
2130 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2131 # bitrate for itag 43 is always 2147483647
2132 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2133 'width': int_or_none(fmt.get('width')),
2134 }
2135
2136 for fmt in streaming_formats:
00eb865b 2137 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2138 continue
2139 url = url_or_none(fmt.get('url'))
2140
2141 if not url:
fa3db383 2142 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2143 if not cipher:
2144 continue
2145 url_data = compat_parse_qs(cipher)
2146 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2147 if not url:
2148 continue
2149 else:
2150 cipher = None
2151 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2152
2f483bc1
S
2153 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2154 # Unsupported FORMAT_STREAM_TYPE_OTF
2155 if stream_type == 3:
2156 continue
6449cd80 2157
bf1317d2
S
2158 format_id = fmt.get('itag') or url_data['itag'][0]
2159 if not format_id:
2160 continue
2161 format_id = compat_str(format_id)
a49eccdf 2162
bf1317d2
S
2163 if cipher:
2164 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
67b19799 2165 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
bf1317d2
S
2166 jsplayer_url_json = self._search_regex(
2167 ASSETS_RE,
2168 embed_webpage if age_gate else video_webpage,
2169 'JS player URL (1)', default=None)
2170 if not jsplayer_url_json and not age_gate:
2171 # We need the embed website after all
2172 if embed_webpage is None:
2173 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2174 embed_webpage = self._download_webpage(
2175 embed_url, video_id, 'Downloading embed webpage')
2176 jsplayer_url_json = self._search_regex(
2177 ASSETS_RE, embed_webpage, 'JS player URL')
2178
2179 player_url = json.loads(jsplayer_url_json)
cf010131 2180 if player_url is None:
bf1317d2
S
2181 player_url_json = self._search_regex(
2182 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2183 video_webpage, 'age gate player URL')
2184 player_url = json.loads(player_url_json)
2185
2186 if 'sig' in url_data:
2187 url += '&signature=' + url_data['sig'][0]
2188 elif 's' in url_data:
2189 encrypted_sig = url_data['s'][0]
2190
2191 if self._downloader.params.get('verbose'):
2192 if player_url is None:
bf1317d2 2193 player_desc = 'unknown'
cf010131 2194 else:
e40c758c
S
2195 player_type, player_version = self._extract_player_info(player_url)
2196 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2197 parts_sizes = self._signature_cache_id(encrypted_sig)
2198 self.to_screen('{%s} signature length %s, %s' %
2199 (format_id, parts_sizes, player_desc))
2200
2201 signature = self._decrypt_signature(
2202 encrypted_sig, video_id, player_url, age_gate)
2203 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2204 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2205 if 'ratebypass' not in url:
2206 url += '&ratebypass=yes'
c9afb51c 2207
94278f72
YCH
2208 dct = {
2209 'format_id': format_id,
2210 'url': url,
2211 'player_url': player_url,
2212 }
2213 if format_id in self._formats:
2214 dct.update(self._formats[format_id])
3318832e 2215 if format_id in formats_spec:
2216 dct.update(formats_spec[format_id])
94278f72 2217
aabc2be6 2218 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2219 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2220 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2221 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2222 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2223
bf1317d2
S
2224 if width is None:
2225 width = int_or_none(fmt.get('width'))
2226 if height is None:
2227 height = int_or_none(fmt.get('height'))
2228
c63ca0ee
S
2229 filesize = int_or_none(url_data.get(
2230 'clen', [None])[0]) or _extract_filesize(url)
2231
bf1317d2
S
2232 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2233 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2234
4878759f
S
2235 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2236 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2237 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2238
94278f72 2239 more_fields = {
c63ca0ee 2240 'filesize': filesize,
bf1317d2 2241 'tbr': tbr,
c9afb51c
AH
2242 'width': width,
2243 'height': height,
bf1317d2
S
2244 'fps': fps,
2245 'format_note': quality_label or quality,
c9afb51c 2246 }
94278f72
YCH
2247 for key, value in more_fields.items():
2248 if value:
2249 dct[key] = value
bf1317d2 2250 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2251 if type_:
2252 type_split = type_.split(';')
2253 kind_ext = type_split[0].split('/')
2254 if len(kind_ext) == 2:
94278f72
YCH
2255 kind, _ = kind_ext
2256 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2257 if kind in ('audio', 'video'):
2258 codecs = None
2259 for mobj in re.finditer(
2260 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2261 if mobj.group('key') == 'codecs':
2262 codecs = mobj.group('val')
2263 break
2264 if codecs:
6310acf5 2265 dct.update(parse_codecs(codecs))
e4a60912
S
2266 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2267 dct['downloader_options'] = {
2268 # Youtube throttles chunks >~10M
2269 'http_chunk_size': 10485760,
2270 }
aabc2be6 2271 formats.append(dct)
c5e8d7af 2272 else:
c3e54389
S
2273 manifest_url = (
2274 url_or_none(try_get(
2275 player_response,
2276 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2277 compat_str))
2278 or url_or_none(try_get(
c3e54389
S
2279 video_info, lambda x: x['hlsvp'][0], compat_str)))
2280 if manifest_url:
2281 formats = []
2282 m3u8_formats = self._extract_m3u8_formats(
2283 manifest_url, video_id, 'mp4', fatal=False)
2284 for a_format in m3u8_formats:
2285 itag = self._search_regex(
2286 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2287 if itag:
2288 a_format['format_id'] = itag
2289 if itag in self._formats:
2290 dct = self._formats[itag].copy()
2291 dct.update(a_format)
2292 a_format = dct
2293 a_format['player_url'] = player_url
2294 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2295 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2296 if self._downloader.params.get('youtube_include_hls_manifest', True):
2297 formats.append(a_format)
c3e54389 2298 else:
13577349 2299 error_message = extract_unavailable_message()
c3e54389 2300 if not error_message:
13577349
S
2301 error_message = clean_html(try_get(
2302 player_response, lambda x: x['playabilityStatus']['reason'],
2303 compat_str))
2304 if not error_message:
2305 error_message = clean_html(
2306 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2307 if error_message:
2308 raise ExtractorError(error_message, expected=True)
2309 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2310
7e72694b 2311 # uploader
dbdaaa23
S
2312 video_uploader = try_get(
2313 video_info, lambda x: x['author'][0],
2314 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2315 if video_uploader:
2316 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2317 else:
2318 self._downloader.report_warning('unable to extract uploader name')
2319
2320 # uploader_id
2321 video_uploader_id = None
2322 video_uploader_url = None
2323 mobj = re.search(
2324 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2325 video_webpage)
2326 if mobj is not None:
2327 video_uploader_id = mobj.group('uploader_id')
2328 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2329 else:
2330 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2331 if owner_profile_url:
2332 video_uploader_id = self._search_regex(
2333 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2334 default=None)
2335 video_uploader_url = owner_profile_url
7e72694b 2336
b45a9e69 2337 channel_id = (
3089bc74
S
2338 str_or_none(video_details.get('channelId'))
2339 or self._html_search_meta(
2340 'channelId', video_webpage, 'channel id', default=None)
2341 or self._search_regex(
b45a9e69 2342 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2343 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2344 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2345
b477fc13
S
2346 thumbnails = []
2347 thumbnails_list = try_get(
2348 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2349 for t in thumbnails_list:
2350 if not isinstance(t, dict):
2351 continue
2352 thumbnail_url = url_or_none(t.get('url'))
2353 if not thumbnail_url:
2354 continue
2355 thumbnails.append({
2356 'url': thumbnail_url,
2357 'width': int_or_none(t.get('width')),
2358 'height': int_or_none(t.get('height')),
2359 })
2360
2361 if not thumbnails:
7e72694b 2362 video_thumbnail = None
b477fc13
S
2363 # We try first to get a high quality image:
2364 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2365 video_webpage, re.DOTALL)
2366 if m_thumb is not None:
2367 video_thumbnail = m_thumb.group(1)
2368 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2369 if thumbnail_url:
2370 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2371 if video_thumbnail:
2372 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2373
2374 # upload date
2375 upload_date = self._html_search_meta(
2376 'datePublished', video_webpage, 'upload date', default=None)
2377 if not upload_date:
2378 upload_date = self._search_regex(
2379 [r'(?s)id="eow-date.*?>(.*?)</span>',
2380 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2381 video_webpage, 'upload date', default=None)
37357d21
S
2382 if not upload_date:
2383 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2384 upload_date = unified_strdate(upload_date)
2385
2386 video_license = self._html_search_regex(
2387 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2388 video_webpage, 'license', default=None)
2389
2390 m_music = re.search(
2391 r'''(?x)
2392 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2393 <ul[^>]*>\s*
2394 <li>(?P<title>.+?)
2395 by (?P<creator>.+?)
2396 (?:
2397 \(.+?\)|
2398 <a[^>]*
2399 (?:
2400 \bhref=["\']/red[^>]*>| # drop possible
2401 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2402 )
2403 .*?
2404 )?</li
2405 ''',
2406 video_webpage)
2407 if m_music:
2408 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2409 video_creator = clean_html(m_music.group('creator'))
2410 else:
2411 video_alt_title = video_creator = None
2412
2413 def extract_meta(field):
2414 return self._html_search_regex(
2415 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2416 video_webpage, field, default=None)
2417
2418 track = extract_meta('Song')
2419 artist = extract_meta('Artist')
92bc97d3 2420 album = extract_meta('Album')
822b9d9c
RA
2421
2422 # Youtube Music Auto-generated description
92bc97d3 2423 release_date = release_year = None
822b9d9c
RA
2424 if video_description:
2425 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2426 if mobj:
2427 if not track:
2428 track = mobj.group('track').strip()
2429 if not artist:
2430 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2431 if not album:
2432 album = mobj.group('album'.strip())
822b9d9c
RA
2433 release_year = mobj.group('release_year')
2434 release_date = mobj.group('release_date')
2435 if release_date:
2436 release_date = release_date.replace('-', '')
2437 if not release_year:
2438 release_year = int(release_date[:4])
2439 if release_year:
2440 release_year = int(release_year)
7e72694b 2441
9322f116 2442 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2443 if yt_initial:
2444 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2445 if len(music_metadata):
2446 album = music_metadata[0].get('album')
2447 artist = music_metadata[0].get('artist')
2448 track = music_metadata[0].get('track')
2449
7e72694b
S
2450 m_episode = re.search(
2451 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2452 video_webpage)
2453 if m_episode:
c2dd2dc0 2454 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2455 season_number = int(m_episode.group('season'))
2456 episode_number = int(m_episode.group('episode'))
2457 else:
2458 series = season_number = episode_number = None
2459
2460 m_cat_container = self._search_regex(
2461 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2462 video_webpage, 'categories', default=None)
dbeafce5 2463 category = None
7e72694b
S
2464 if m_cat_container:
2465 category = self._html_search_regex(
2466 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2467 default=None)
dbeafce5
S
2468 if not category:
2469 category = try_get(
2470 microformat, lambda x: x['category'], compat_str)
2471 video_categories = None if category is None else [category]
7e72694b
S
2472
2473 video_tags = [
2474 unescapeHTML(m.group('content'))
2475 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2476 if not video_tags:
2477 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2478
2479 def _extract_count(count_name):
2480 return str_to_int(self._search_regex(
a6c666d0 2481 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2482 % re.escape(count_name),
2483 video_webpage, count_name, default=None))
2484
2485 like_count = _extract_count('like')
2486 dislike_count = _extract_count('dislike')
2487
dbdaaa23
S
2488 if view_count is None:
2489 view_count = str_to_int(self._search_regex(
2490 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2491 'view count', default=None))
2492
bf3c9326
S
2493 average_rating = (
2494 float_or_none(video_details.get('averageRating'))
2495 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2496
7e72694b 2497 # subtitles
321bf820 2498 video_subtitles = self.extract_subtitles(
2499 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2500 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2501
2502 video_duration = try_get(
2503 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2504 if not video_duration:
2505 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2506 if not video_duration:
2507 video_duration = parse_duration(self._html_search_meta(
2508 'duration', video_webpage, 'video duration'))
2509
b84071c0
JP
2510 # Get Subscriber Count of channel
2511 subscriber_count = parse_count(self._search_regex(
2512 r'"text":"([\d\.]+\w?) subscribers"',
2513 video_webpage,
2514 'subscriber count',
2515 default=None
2516 ))
2517
7e72694b
S
2518 # annotations
2519 video_annotations = None
2520 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2521 xsrf_token = self._search_regex(
2522 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2523 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2524 invideo_url = try_get(
2525 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2526 if xsrf_token and invideo_url:
2527 xsrf_field_name = self._search_regex(
2528 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2529 video_webpage, 'xsrf field name',
2530 group='xsrf_field_name', default='session_token')
2531 video_annotations = self._download_webpage(
2532 self._proto_relative_url(invideo_url),
2533 video_id, note='Downloading annotations',
2534 errnote='Unable to download video annotations', fatal=False,
2535 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2536
84213ea8 2537 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2538
dd27fd17 2539 # Look for the DASH manifest
203fb43f 2540 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2541 dash_mpd_fatal = True
8ff648e4 2542 for mpd_url in dash_mpds:
d8d24a92 2543 dash_formats = {}
774e208f 2544 try:
05d0d131
YCH
2545 def decrypt_sig(mobj):
2546 s = mobj.group(1)
2547 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2548 return '/signature/%s' % dec_s
2549
8ff648e4 2550 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2551
8ff648e4 2552 for df in self._extract_mpd_formats(
2553 mpd_url, video_id, fatal=dash_mpd_fatal,
2554 formats_dict=self._formats):
c63ca0ee
S
2555 if not df.get('filesize'):
2556 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2557 # Do not overwrite DASH format found in some previous DASH manifest
2558 if df['format_id'] not in dash_formats:
2559 dash_formats[df['format_id']] = df
77c6fb5b
S
2560 # Additional DASH manifests may end up in HTTP Error 403 therefore
2561 # allow them to fail without bug report message if we already have
2562 # some DASH manifest succeeded. This is temporary workaround to reduce
2563 # burst of bug reports until we figure out the reason and whether it
2564 # can be fixed at all.
2565 dash_mpd_fatal = False
774e208f
PH
2566 except (ExtractorError, KeyError) as e:
2567 self.report_warning(
2568 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2569 if dash_formats:
04b3b3df
JMF
2570 # Remove the formats we found through non-DASH, they
2571 # contain less info and it can be wrong, because we use
2572 # fixed values (for example the resolution). See
067aa17e 2573 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2574 # example.
d80265cc 2575 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2576 formats.extend(dash_formats.values())
d80044c2 2577
6271f1ca
PH
2578 # Check for malformed aspect ratio
2579 stretched_m = re.search(
2580 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2581 video_webpage)
2582 if stretched_m:
313dfc45
LL
2583 w = float(stretched_m.group('w'))
2584 h = float(stretched_m.group('h'))
5faf9fed
S
2585 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2586 # We will only process correct ratios.
313dfc45 2587 if w > 0 and h > 0:
41f24c32 2588 ratio = w / h
313dfc45
LL
2589 for f in formats:
2590 if f.get('vcodec') != 'none':
2591 f['stretched_ratio'] = ratio
6271f1ca 2592
026fbedc 2593 if not formats:
43ebf77d
S
2594 if 'reason' in video_info:
2595 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2596 regions_allowed = self._html_search_meta(
2597 'regionsAllowed', video_webpage, default=None)
2598 countries = regions_allowed.split(',') if regions_allowed else None
2599 self.raise_geo_restricted(
2600 msg=video_info['reason'][0], countries=countries)
2601 reason = video_info['reason'][0]
2602 if 'Invalid parameters' in reason:
2603 unavailable_message = extract_unavailable_message()
2604 if unavailable_message:
2605 reason = unavailable_message
2606 raise ExtractorError(
2607 'YouTube said: %s' % reason,
2608 expected=True, video_id=video_id)
2609 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2610 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2611
4bcc7bd1 2612 self._sort_formats(formats)
4ea3be0a 2613
21c340b8 2614 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2615
4ea3be0a 2616 return {
8bcc8756
JW
2617 'id': video_id,
2618 'uploader': video_uploader,
2619 'uploader_id': video_uploader_id,
fd050249 2620 'uploader_url': video_uploader_url,
dd4c4492
S
2621 'channel_id': channel_id,
2622 'channel_url': channel_url,
8bcc8756 2623 'upload_date': upload_date,
7caf9830 2624 'license': video_license,
936784b2 2625 'creator': video_creator or artist,
8bcc8756 2626 'title': video_title,
936784b2 2627 'alt_title': video_alt_title or track,
b477fc13 2628 'thumbnails': thumbnails,
8bcc8756
JW
2629 'description': video_description,
2630 'categories': video_categories,
000b6b5a 2631 'tags': video_tags,
8bcc8756 2632 'subtitles': video_subtitles,
360e1ca5 2633 'automatic_captions': automatic_captions,
8bcc8756
JW
2634 'duration': video_duration,
2635 'age_limit': 18 if age_gate else 0,
2636 'annotations': video_annotations,
9cafc3fd 2637 'chapters': chapters,
7e8c0af0 2638 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2639 'view_count': view_count,
4ea3be0a 2640 'like_count': like_count,
2641 'dislike_count': dislike_count,
bf3c9326 2642 'average_rating': average_rating,
8bcc8756 2643 'formats': formats,
2fe1ff85 2644 'is_live': is_live,
7c80519c 2645 'start_time': start_time,
297a564b 2646 'end_time': end_time,
12afdc2a
S
2647 'series': series,
2648 'season_number': season_number,
2649 'episode_number': episode_number,
936784b2
S
2650 'track': track,
2651 'artist': artist,
5caabd3c 2652 'album': album,
2653 'release_date': release_date,
2654 'release_year': release_year,
b84071c0 2655 'subscriber_count': subscriber_count,
4ea3be0a 2656 }
c5e8d7af 2657
5f6a1245 2658
8e7aad20 2659class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2660 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2661 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2662 (?:https?://)?
2663 (?:\w+\.)?
c5e8d7af 2664 (?:
c0345b82 2665 (?:
66b48727 2666 youtube(?:kids)?\.com|
c0345b82
S
2667 invidio\.us
2668 )
2669 /
feaa5ad7 2670 (?:
87dadd45 2671 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2672 \? (?:.*?[&;])*? (?:p|a|list)=
2673 | p/
2674 )|
2675 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2676 )
d67cc9fa 2677 (
66b48727 2678 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2679 # Top tracks, they can also include dots
d67cc9fa
JMF
2680 |(?:MC)[\w\.]*
2681 )
c5e8d7af
PH
2682 .*
2683 |
d0ba5587
S
2684 (%(playlist_id)s)
2685 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2686 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
351f37c0
S
2687 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2688 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
78caa52a 2689 IE_NAME = 'youtube:playlist'
7f4f0b21 2690 _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
2691 _YTM_CHANNEL_INFO = {
2692 'uploader': 'Youtube Music',
2693 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
2694 'uploader_url': 'https://www.youtube.com/music'
2695 }
81127aa5 2696 _TESTS = [{
0e30a7b9 2697 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2698 'info_dict': {
0e30a7b9 2699 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2700 'uploader': 'Sergey M.',
2701 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2702 'title': 'youtube-dl public playlist',
81127aa5 2703 },
0e30a7b9 2704 'playlist_count': 1,
9291475f 2705 }, {
0e30a7b9 2706 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2707 'info_dict': {
0e30a7b9 2708 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2709 'uploader': 'Sergey M.',
2710 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2711 'title': 'youtube-dl empty playlist',
9291475f
PH
2712 },
2713 'playlist_count': 0,
2714 }, {
2715 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2716 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2717 'info_dict': {
2718 'title': '29C3: Not my department',
acf757f4 2719 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
13a75688
S
2720 'uploader': 'Christiaan008',
2721 'uploader_id': 'ChRiStIaAn008',
9291475f 2722 },
0e30a7b9 2723 'playlist_count': 96,
9291475f
PH
2724 }, {
2725 'note': 'issue #673',
2726 'url': 'PLBB231211A4F62143',
2727 'info_dict': {
f46a8702 2728 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2729 'id': 'PLBB231211A4F62143',
13a75688
S
2730 'uploader': 'Wickydoo',
2731 'uploader_id': 'Wickydoo',
9291475f
PH
2732 },
2733 'playlist_mincount': 26,
2734 }, {
2735 'note': 'Large playlist',
2736 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2737 'info_dict': {
2738 'title': 'Uploads from Cauchemar',
acf757f4 2739 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
13a75688
S
2740 'uploader': 'Cauchemar',
2741 'uploader_id': 'Cauchemar89',
9291475f
PH
2742 },
2743 'playlist_mincount': 799,
2744 }, {
2745 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2746 'info_dict': {
2747 'title': 'YDL_safe_search',
acf757f4 2748 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2749 },
2750 'playlist_count': 2,
4201ba13 2751 'skip': 'This playlist is private',
ac7553d0
PH
2752 }, {
2753 'note': 'embedded',
2d3d2997 2754 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2755 'playlist_count': 4,
2756 'info_dict': {
2757 'title': 'JODA15',
acf757f4 2758 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
13a75688
S
2759 'uploader': 'milan',
2760 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
ac7553d0 2761 }
87dadd45
S
2762 }, {
2763 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2764 'playlist_mincount': 485,
2765 'info_dict': {
13a75688 2766 'title': '2018 Chinese New Singles (11/6 updated)',
87dadd45 2767 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
13a75688
S
2768 'uploader': 'LBK',
2769 'uploader_id': 'sdragonfang',
87dadd45 2770 }
6b08cdf6
PH
2771 }, {
2772 'note': 'Embedded SWF player',
2d3d2997 2773 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2774 'playlist_count': 4,
2775 'info_dict': {
2776 'title': 'JODA7',
acf757f4 2777 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
13a75688
S
2778 },
2779 'skip': 'This playlist does not exist',
4b7df0d3
JMF
2780 }, {
2781 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2782 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2783 'info_dict': {
acf757f4
PH
2784 'title': 'Uploads from Interstellar Movie',
2785 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688
S
2786 'uploader': 'Interstellar Movie',
2787 'uploader_id': 'InterstellarMovie1',
4b7df0d3 2788 },
481cc733 2789 'playlist_mincount': 21,
dacb3a86
S
2790 }, {
2791 # Playlist URL that does not actually serve a playlist
2792 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2793 'info_dict': {
2794 'id': 'FqZTN594JQw',
2795 'ext': 'webm',
2796 'title': "Smiley's People 01 detective, Adventure Series, Action",
2797 'uploader': 'STREEM',
2798 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2800 'upload_date': '20150526',
2801 'license': 'Standard YouTube License',
2802 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2803 'categories': ['People & Blogs'],
2804 'tags': list,
dbdaaa23 2805 'view_count': int,
dacb3a86
S
2806 'like_count': int,
2807 'dislike_count': int,
2808 },
2809 'params': {
2810 'skip_download': True,
2811 },
13a75688 2812 'skip': 'This video is not available.',
dacb3a86 2813 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2814 }, {
2815 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2816 'info_dict': {
2817 'id': 'yeWKywCrFtk',
2818 'ext': 'mp4',
2819 'title': 'Small Scale Baler and Braiding Rugs',
2820 'uploader': 'Backus-Page House Museum',
2821 'uploader_id': 'backuspagemuseum',
ec85ded8 2822 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733 2823 'upload_date': '20161008',
481cc733
S
2824 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2825 'categories': ['Nonprofits & Activism'],
2826 'tags': list,
2827 'like_count': int,
2828 'dislike_count': int,
2829 },
2830 'params': {
2831 'noplaylist': True,
2832 'skip_download': True,
2833 },
2e18adec
S
2834 }, {
2835 # https://github.com/ytdl-org/youtube-dl/issues/21844
2836 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2837 'info_dict': {
2838 'title': 'Data Analysis with Dr Mike Pound',
2839 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2840 'uploader_id': 'Computerphile',
2841 'uploader': 'Computerphile',
2842 },
2843 'playlist_mincount': 11,
feaa5ad7
S
2844 }, {
2845 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2846 'only_matching': True,
a6857510
S
2847 }, {
2848 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2849 'only_matching': True,
409b9324
S
2850 }, {
2851 # music album playlist
2852 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2853 'only_matching': True,
c0345b82
S
2854 }, {
2855 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2856 'only_matching': True,
66b48727
RA
2857 }, {
2858 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2859 'only_matching': True,
81127aa5 2860 }]
c5e8d7af 2861
880e1c52
JMF
def _real_initialize(self):
    """Initialise the extractor by delegating to ``self._login()``."""
    self._login()
2864
351f37c0
S
def extract_videos_from_page(self, page):
    """Return an iterable of ``(video_id, title)`` pairs found in ``page``.

    Tags carrying a ``data-video-id`` attribute are read first; the id and
    the (optional) ``data-title`` attribute are taken from each such tag.
    The two result lists are then handed to
    ``extract_videos_from_page_impl`` once per fallback pattern
    (presumably it appends further matches to them -- that helper is not
    visible here).
    """
    video_ids = []
    video_titles = []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        tag_attrs = extract_attributes(tag)
        found_id = tag_attrs['data-video-id']
        found_title = unescapeHTML(tag_attrs.get('data-title'))
        # Only strip real titles; a missing/empty title is stored unchanged.
        found_title = found_title.strip() if found_title else found_title
        video_ids.append(found_id)
        video_titles.append(found_title)

    fallback_patterns = (
        # Old-style _VIDEO_RE first ...
        self._VIDEO_RE,
        # ... then the relaxed variants.
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, video_ids, video_titles)

    return zip(video_ids, video_titles)
2892
def _extract_mix_ids_from_yt_initial(self, yt_initial):
    """Return the video ids listed in the playlist panel of ``yt_initial``.

    Reads ``contents.twoColumnWatchNextResults.playlist.playlist.contents``
    and keeps the ``playlistPanelVideoRenderer.videoId`` of every entry that
    has one. An empty list is returned when the panel is absent.
    """
    panel_entries = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'],
        list) or []
    candidate_ids = (
        try_get(entry, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
        for entry in panel_entries)
    return [candidate for candidate in candidate_ids if candidate]
2902
def _extract_mix(self, playlist_id):
    """Extract a YouTube mix as a playlist result.

    A mix is generated from a single video: the last 11 characters of
    ``playlist_id`` are used as the first video id. Watch pages are
    downloaded repeatedly, each time seeded with the last id collected so
    far, until a page yields no id that has not been seen already.
    """
    collected_ids = []
    yt_initial = None
    seed_id = playlist_id[-11:]
    # Loop-invariant: links inside the mix panel that point back to this list.
    mix_link_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    page_number = 0
    while True:
        page_number += 1
        watch_url = 'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            watch_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_number))
        page_ids = orderedSet(re.findall(mix_link_re, webpage))

        # Nothing in the HTML: fall back to the JSON embedded in the page.
        if not page_ids:
            yt_initial = self._get_yt_initial_data(playlist_id, webpage)
            if yt_initial:
                page_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

        # Stop as soon as a page brings no unseen video.
        unseen_ids = [vid for vid in page_ids if vid not in collected_ids]
        if not unseen_ids:
            break
        collected_ids.extend(unseen_ids)
        seed_id = collected_ids[-1]

    url_results = self._ids_to_results(collected_ids)

    # Try the known title containers in order, on the last page downloaded.
    title_markup = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_markup = get_element_by_attribute('class', class_name, webpage)
        if title_markup:
            break
    title = clean_html(title_markup)

    if not title:
        title = try_get(
            yt_initial,
            lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'],
            compat_str)

    return self.playlist_result(url_results, playlist_id, title)
2945
def _extract_playlist(self, playlist_id):
    """Download a playlist page and build its playlist result.

    Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False only
    when no playlist title was found and ``self._entries`` yields nothing.

    Raises ExtractorError (expected) when the page carries an alert saying
    the playlist does not exist, is private, or the parameters are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # The alert div may carry extra attributes such as tabindex
    # (see https://github.com/ytdl-org/youtube-dl/issues/11604).
    alerts = re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()

        # Missing or private playlist
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            message_parts = ['This playlist %s' % reason]
            if 'private' in reason:
                message_parts.append(', use --username or --netrc to access it')
            message_parts.append('.')
            raise ExtractorError(''.join(message_parts), expected=True)

        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)

        # The language chooser is not an error; ignore it silently.
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue

        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
        page, 'uploader', default=None)
    uploader_match = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
        page)
    uploader_id, uploader_url = (
        (uploader_match.group('uploader_id'),
         compat_urlparse.urljoin(url, uploader_match.group('path')))
        if uploader_match else (None, None))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4),
        # so probe for at least one entry.
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist.update({
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    })
    # Playlists with the Youtube Music prefix get the fixed channel info.
    if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
        playlist.update(self._YTM_CHANNEL_INFO)

    return has_videos, playlist
c5e8d7af 3009
def _check_download_just_video(self, url, playlist_id):
    """Detect a video id inside a playlist URL.

    Returns ``(video_id, result)``:
    ``(None, None)`` when the URL names no single video;
    ``(video_id, url_result)`` when the ``noplaylist`` param is set;
    ``(video_id, None)`` otherwise.
    """
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        # No ?v= parameter: look for youtu.be/<id> or /embed/<id> forms.
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)

    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 3024
ebf1b291
S
3025 def _real_extract(self, url):
3026 # Extract playlist id
3027 mobj = re.match(self._VALID_URL, url)
3028 if mobj is None:
3029 raise ExtractorError('Invalid URL: %s' % url)
3030 playlist_id = mobj.group(1) or mobj.group(2)
3031
dacb3a86 3032 video_id, video = self._check_download_just_video(url, playlist_id)
ebf1b291
S
3033 if video:
3034 return video
3035
466a6145 3036 if playlist_id.startswith(('RD', 'UL', 'PU')):
7f4f0b21 3037 if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
659ddd7f 3038 # Mixes require a custom extraction process,
3039 # Youtube Music playlists act like normal playlists (with randomized order)
3040 return self._extract_mix(playlist_id)
448830ce 3041
dacb3a86
S
3042 has_videos, playlist = self._extract_playlist(playlist_id)
3043 if has_videos or not video_id:
3044 return playlist
3045
3046 # Some playlist URLs don't actually serve a playlist (see
067aa17e 3047 # https://github.com/ytdl-org/youtube-dl/issues/10537).
dacb3a86
S
3048 # Fallback to plain video extraction if there is a video id
3049 # along with playlist id.
3050 return self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 3051
c5e8d7af 3052
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extract the videos of a channel addressed by a /channel/<id> URL."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    # Listing page that gets downloaded; the placeholder takes the channel id.
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    # A /watch link, optionally preceded by the title attribute of its tag.
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the listing page URL for ``channel_id``."""
        return self._TEMPLATE_URL % channel_id

    def _find_channel_playlist_id(self, listing_url, channel_id):
        """Look up the channel id advertised in the page's meta tags.

        Returns False when the page could not be downloaded, otherwise
        whatever the meta tag / app link lookups produced (possibly None).
        """
        page = self._download_webpage(
            listing_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if page is False:
            return False

        meta_id = self._html_search_meta(
            'channelId', page, 'channel id', default=None)
        if meta_id:
            return meta_id

        # No channelId meta tag: try the mobile app deep links instead.
        app_url = self._html_search_meta(
            ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
            page, 'channel url', default=None)
        if not app_url:
            return meta_id
        return self._search_regex(
            r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
            app_url, 'channel id', default=None)

    def _real_extract(self, url):
        """Return the channel's videos, preferably via its uploads playlist."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_playlist_id = self._find_channel_playlist_id(url, channel_id)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            uploads_id = 'UU' + channel_playlist_id[2:]
            playlist_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % uploads_id)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        first_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', first_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            single_page_entries = []
            for video_id, video_title in self.extract_videos_from_page(first_page):
                single_page_entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(single_page_entries, channel_id)

        # Probe for a first entry; when there is none, surface the alert
        # shown on the page (if any) as an expected error.
        try:
            next(self._entries(first_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                first_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(first_page, channel_id), channel_id)
c5e8d7af
PH
3152
3153
class YoutubeUserIE(YoutubeChannelIE):
    """Extract a user's videos from /user/, /c/, bare-name or ytuser: URLs."""

    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    # Placeholders: path kind ('user' or 'c') and the user/channel name.
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only when no other Youtube*IE in this module does.

        The pattern of this extractor is too permissive on its own: it
        would also match URLs that belong to the other YouTube extractors.
        """
        for name, klass in globals().items():
            is_other_yt_ie = (
                name.startswith('Youtube') and name.endswith('IE')
                and klass is not cls)
            if is_other_yt_ie and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the listing page URL, keeping the user/c path kind of ``url``."""
        match = re.match(self._VALID_URL, url)
        # URLs without an explicit path kind are treated as /user/ ones.
        path_kind = match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, match.group('id'))
3211
b05654f0 3212
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Resolve a channel's /live URL to the video it currently shows."""

    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the video behind the /live page, else the channel itself."""
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id', 'base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            # Page unavailable: hand the channel URL to another extractor.
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page carrying a
        # well-formed 11 character video id.
        is_video_page = page_type.startswith('video') and video_id and re.match(
            r'^[0-9A-Za-z_-]{11}$', video_id)
        if is_video_page:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3263
3264
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Match the /playlists tab of a user, channel or custom (/c/) URL.

    Only declares the matching pattern and tests; no method is overridden.
    """

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
0c148415
S
3297
3298
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    """Run a "ytsearch" query against the youtubei search endpoint."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the 'params' field of the request; subclasses
    # set it to alter the search.
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to ``n`` url_transparent results for ``query``.

        Pages are requested one after another. Iteration stops as soon as
        ``n`` results were yielded, a page fails to download, a page holds
        no result section, or no continuation token is offered.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first getter fits the initial response, the second one
            # the responses to continuation requests.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # The count is shown with thousands separators (e.g.
                # "1,234,567 views"). Capture the separators together with
                # the digits and let str_to_int drop them; matching bare
                # digits only would stop at the first separator and
                # report 1 for the example above.
                view_count = str_to_int(self._search_regex(
                    r'^(\d[\d,.]*)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            # Ask for the page following the one just processed.
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3387
c9ae7b95 3388
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that lists the newest videos first."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Sent as the 'params' field of the search request by the parent class.
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3394
c9ae7b95 3395
class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    """Extract the results shown by a youtube.com/results search URL."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _process_json_dict(self, obj, videos, c):
        """Collect ``obj`` as a video or record its continuation data.

        A dict with a 'videoId' key is appended to ``videos``; otherwise a
        'nextContinuationData' value is stored in ``c['continuation']``.
        """
        if 'videoId' in obj:
            videos.append(obj)
        elif 'nextContinuationData' in obj:
            c['continuation'] = obj['nextContinuationData']

    def _real_extract(self, url):
        """Return a playlist, titled with the decoded query, of the results."""
        encoded_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(encoded_query)
        webpage = self._download_webpage(url, query)
        entries = self._entries(webpage, query, max_pages=5)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
3425
3426
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract a show by delegating to the listing of its /playlists page."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the parent extraction on the show's /playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3444
3445
class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name of the subclass."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _process_entries(self, entries, seen):
        """Yield url results for the entries that are not in ``seen`` yet.

        Entries without a 'videoId' are ignored. The new entries are added
        to ``seen`` (in place) before the first result is yielded.
        """
        fresh = []
        for candidate in entries:
            candidate_id = try_get(candidate, lambda x: x['videoId'])
            if not candidate_id:
                continue
            # Compared against earlier batches only, not against the other
            # members of this batch.
            if any(known['videoId'] == candidate_id for known in seen):
                continue
            fresh.append(candidate)

        if not fresh:
            return

        seen.extend(fresh)
        for video in fresh:
            yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        entries = self._entries(page, self._PLAYLIST_TITLE)
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3489
3490
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extract the "watch later" list, i.e. the playlist with id 'WL'."""

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when one is selected, else the WL playlist."""
        _video_id, single_video = self._check_download_just_video(url, 'WL')
        if single_video:
            return single_video
        _has_videos, watch_later = self._extract_playlist('WL')
        return watch_later
f459d170 3510
5f6a1245 3511
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Find the favourites playlist and delegate to the playlist extractor."""

    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result for the playlist id found on /my_favorites."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is taken from the first "list=" parameter on the page.
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
3522
3523
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 3529
1ed5b5c9 3530
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 3536
1ed5b5c9 3537
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed."""

    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3543
3544
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their video id parameter.

    Such URLs typically result from an unquoted "&" on the command line;
    extraction always fails with an explanatory, expected error.
    """

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise ExtractorError advising to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3592
3593
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than 11 characters."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise ExtractorError reporting the incomplete id."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)