]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
manually set limit for youtubesearchurl
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
9833e7a0 39 js_to_json,
94278f72 40 mimetype2ext,
4bb4a188 41 orderedSet,
6310acf5 42 parse_codecs,
b84071c0 43 parse_count,
7c80519c 44 parse_duration,
0cb58b02 45 remove_quotes,
3995d37d 46 remove_start,
cf7e015f 47 smuggle_url,
dbdaaa23 48 str_or_none,
c93d53f5 49 str_to_int,
556dbe7f 50 try_get,
c5e8d7af
PH
51 unescapeHTML,
52 unified_strdate,
cf7e015f 53 unsmuggle_url,
81c2f20b 54 uppercase_escape,
21c340b8 55 url_or_none,
6e6bc8da 56 urlencode_postdata,
c5e8d7af
PH
57)
58
5f6a1245 59
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (contains hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Always report failure as False, as the docstring promises
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus f_req (JSON-encoded) to url.

            Returns the parsed JSON response, or False when the request
            fails (the download is non-fatal).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the response
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Always report failure as False, as the docstring promises
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional must be parenthesised: '%' binds tighter than
            # 'if ... else', so without parentheses the prefix is lost for
            # every message other than INCORRECT_ANSWER_ENTERED.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same precedence reason as above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Call the parent implementation with a copy of the 'query' mapping.

        A missing or None 'query' is replaced by an empty dict.
        """
        query = (kwargs.get('query') or {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON object from webpage.

        Returns None when the object is not found; JSON parsing is
        non-fatal.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in."""
        if self._downloader is None:
            return
        self._set_language()
        # The result is deliberately ignored: a failed login is not fatal here
        self._login()
c5e8d7af 299
8377574c 300
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for extractors that page through entry lists found in ytInitialData.

    This class calls self._is_entry(obj) and
    self._process_entries(entries, seen) but does not define them.
    """

    def _find_entries_in_json(self, extracted):
        """Recursively collect entry dicts and a continuation dict.

        Returns a tuple (entries, continuation). entries holds every dict
        accepted by self._is_entry; the search does not descend into an
        accepted dict. continuation is the last visited dict that has a
        'continuationCommand' key, or None when there is none.
        """
        entries = []
        found = {}

        def _real_find(obj):
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if self._is_entry(obj):
                    entries.append(obj)
                    return

                if 'continuationCommand' in obj:
                    found['continuation'] = obj
                    return

                for value in obj.values():
                    _real_find(value)
            # None, strings and other scalars hold nothing to search

        _real_find(extracted)

        return entries, found.get('continuation')

    def _entries(self, page, playlist_id, n=1):
        """Yield processed entries from page and from follow-up continuation pages.

        page is the HTML of the first page, playlist_id is used as the id
        for parsing and downloading, and n is the maximum number of pages
        (the initial one included) whose entries are yielded.
        """
        seen = []

        # Merge every ytcfg.set({...}) object found in the page
        yt_conf = {}
        for m in re.finditer(self._YTCFG_DATA_RE, page):
            parsed = self._parse_json(m.group(1), playlist_id,
                                      transform_source=js_to_json, fatal=False)
            if parsed:
                yt_conf.update(parsed)

        data_json = self._parse_json(
            self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'),
            playlist_id)

        for page_num in range(n):
            entries, continuation = self._find_entries_in_json(data_json)
            # Materialise the result: _process_entries may be a generator,
            # and a generator object is always truthy, which would make the
            # emptiness check below a no-op.
            processed = list(self._process_entries(entries, seen))

            if not processed:
                break
            for entry in processed:
                yield entry

            # Do not download a continuation page that would never be used
            if page_num + 1 >= n:
                break
            if not continuation or not yt_conf:
                break
            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
            if not continuation_token or not continuation_url:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    data_json = self._download_json(
                        'https://www.youtube.com%s' % continuation_url,
                        playlist_id,
                        'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
                        transform_source=uppercase_escape,
                        query={
                            'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
                        },
                        # str.encode works on both Python 2 and 3, whereas
                        # bytes(..., encoding=...) raises TypeError on Python 2
                        data=json.dumps({
                            'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
                            'continuation': continuation_token
                        }).encode('utf-8'),
                        headers={
                            'Content-Type': 'application/json'
                        }
                    )
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

    def _extract_title(self, renderer):
        """Return the title text of renderer, or None.

        title.runs[0].text is preferred; title.simpleText is the fallback.
        """
        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        if title:
            return title
        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
648e6a1f 394
061a75ed
S
395
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are dicts carrying a 'videoId' key."""

    def _is_entry(self, obj):
        """Tell whether obj has a 'videoId' key."""
        return 'videoId' in obj

    def _process_entries(self, entries, seen):
        """Yield one 'Youtube' url_result per distinct video id, in first-seen order.

        Renderers without a video id or a title are skipped. When an id
        occurs more than once, the first non-empty stripped title is kept.
        """
        video_ids = []
        video_titles = []
        for renderer in entries:
            vid = try_get(renderer, lambda x: x['videoId'])
            title = self._extract_title(renderer)

            # we do not have a videoRenderer or title extraction broke
            if vid is None or title is None:
                continue

            title = title.strip()

            if vid in video_ids:
                # Fill in a title only if the earlier occurrence had none
                pos = video_ids.index(vid)
                if title and not video_titles[pos]:
                    video_titles[pos] = title
            else:
                video_ids.append(vid)
                video_titles.append(title)

        for vid, title in zip(video_ids, video_titles):
            yield self.url_result(vid, 'Youtube', vid, title)
648e6a1f
S
423
424
061a75ed 425class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
9833e7a0
LR
426 def _is_entry(self, obj):
427 return 'playlistId' in obj
428
429 def _process_entries(self, entries, seen):
430 for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries):
061a75ed
S
431 yield self.url_result(
432 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
433
0c148415
S
434 def _real_extract(self, url):
435 playlist_id = self._match_id(url)
436 webpage = self._download_webpage(url, playlist_id)
0c148415 437 title = self._og_search_title(webpage, fatal=False)
061a75ed 438 return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
0c148415
S
439
440
360e1ca5 441class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 442 IE_DESC = 'YouTube.com'
cb7dfeea 443 _VALID_URL = r"""(?x)^
c5e8d7af 444 (
edb53e2d 445 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 446 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 447 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 448 (?:www\.)?pwnyoutube\.com/|
8b561bfc 449 (?:www\.)?hooktube\.com/|
f7000f3a 450 (?:www\.)?yourepeat\.com/|
e69ae5b9 451 tube\.majestyc\.net/|
ba036333 452 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 453 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 454 (?:(?:www|no)\.)?invidiou\.sh/|
455 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 456 (?:www\.)?invidious\.kabi\.tk/|
ba036333 457 (?:www\.)?invidious\.13ad\.de/|
791d2e81 458 (?:www\.)?invidious\.mastodon\.host/|
494d664e 459 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 460 (?:www\.)?invidious\.drycat\.fr/|
ba036333 461 (?:www\.)?tube\.poal\.co/|
8ae113ca 462 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 463 (?:www\.)?yewtu\.be/|
494d664e 464 (?:www\.)?yt\.elukerio\.org/|
894b3826 465 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 466 (?:www\.)?invidious\.ggc-project\.de/|
467 (?:www\.)?yt\.maisputain\.ovh/|
468 (?:www\.)?invidious\.13ad\.de/|
469 (?:www\.)?invidious\.toot\.koeln/|
470 (?:www\.)?invidious\.fdn\.fr/|
471 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 472 (?:www\.)?kgg2m7yk5aybusll\.onion/|
473 (?:www\.)?qklhadlycap4cnod\.onion/|
474 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
475 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
476 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
477 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 478 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 479 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 480 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
481 (?:.*?\#/)? # handle anchor (#/) redirect urls
482 (?: # the various things that can precede the ID:
ac7553d0 483 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 484 |(?: # or the v= param in all its forms
f7000f3a 485 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 486 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 487 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
488 v=
489 )
f4b05232 490 ))
cbaed4bb
S
491 |(?:
492 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
493 vid\.plus| # or vid.plus/xxxx
494 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 495 )/
edb53e2d 496 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 497 )
c5e8d7af 498 )? # all until now is optional -> you can pass the naked ID
8963d9c2 499 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
500 (?!.*?\blist=
501 (?:
502 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
503 WL # WL are handled by the watch later IE
504 )
505 )
c5e8d7af 506 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 507 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 508 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
509 _PLAYER_INFO_RE = (
510 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
511 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
512 )
2c62dc26 513 _formats = {
c2d3cb4c 514 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
515 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
516 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
517 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
518 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
519 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
520 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
521 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 522 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 523 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
524 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
525 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
526 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
527 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
528 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 529 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 530 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
531 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 532
533
534 # 3D videos
c2d3cb4c 535 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
536 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
537 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
538 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 539 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
540 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
541 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 542
96fb5605 543 # Apple HTTP Live Streaming
11f12195 544 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 545 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
546 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
547 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
548 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
549 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 550 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
551 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
552
553 # DASH mp4 video
d23028a8
S
554 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
555 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
556 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
557 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
558 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 559 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
560 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
561 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
562 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
563 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
564 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
565 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 566
f6f1fc92 567 # Dash mp4 audio
d23028a8
S
568 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
569 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
570 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
571 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
572 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
573 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
574 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
575
576 # Dash webm
d23028a8
S
577 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
578 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
579 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
580 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
581 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
582 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
583 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
584 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
585 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
586 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
587 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
588 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
589 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
590 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
591 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 592 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
593 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
594 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
595 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
596 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
597 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
598 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
599
600 # Dash webm audio
d23028a8
S
601 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
602 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 603
0857baad 604 # Dash webm audio with opus inside
d23028a8
S
605 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
606 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
607 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 608
ce6b9a2d
PH
609 # RTMP (unnamed)
610 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
611
612 # av01 video only formats sometimes served with "unknown" codecs
613 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
614 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
615 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
616 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 617 }
84da5d84 618 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 619
fd5c4aab
S
620 _GEO_BYPASS = False
621
78caa52a 622 IE_NAME = 'youtube'
2eb88d95
PH
623 _TESTS = [
624 {
2d3d2997 625 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
626 'info_dict': {
627 'id': 'BaW_jenozKc',
628 'ext': 'mp4',
3867038a 629 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
630 'uploader': 'Philipp Hagemeister',
631 'uploader_id': 'phihag',
ec85ded8 632 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
633 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
634 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 635 'upload_date': '20121002',
3867038a 636 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 637 'categories': ['Science & Technology'],
3867038a 638 'tags': ['youtube-dl'],
556dbe7f 639 'duration': 10,
dbdaaa23 640 'view_count': int,
3e7c1224
PH
641 'like_count': int,
642 'dislike_count': int,
7c80519c 643 'start_time': 1,
297a564b 644 'end_time': 9,
2eb88d95 645 }
0e853ca4 646 },
fccd3771 647 {
4bc3a23e
PH
648 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
649 'note': 'Embed-only video (#1746)',
650 'info_dict': {
651 'id': 'yZIXLfi8CZQ',
652 'ext': 'mp4',
653 'upload_date': '20120608',
654 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
655 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
656 'uploader': 'SET India',
94bfcd23 657 'uploader_id': 'setindia',
ec85ded8 658 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 659 'age_limit': 18,
fccd3771
PH
660 }
661 },
11b56058 662 {
2d3d2997 663 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
664 'note': 'Use the first video ID in the URL',
665 'info_dict': {
666 'id': 'BaW_jenozKc',
667 'ext': 'mp4',
3867038a 668 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
669 'uploader': 'Philipp Hagemeister',
670 'uploader_id': 'phihag',
ec85ded8 671 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 672 'upload_date': '20121002',
3867038a 673 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 674 'categories': ['Science & Technology'],
3867038a 675 'tags': ['youtube-dl'],
556dbe7f 676 'duration': 10,
dbdaaa23 677 'view_count': int,
11b56058
PM
678 'like_count': int,
679 'dislike_count': int,
34a7de29
S
680 },
681 'params': {
682 'skip_download': True,
683 },
11b56058 684 },
dd27fd17 685 {
2d3d2997 686 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
687 'note': '256k DASH audio (format 141) via DASH manifest',
688 'info_dict': {
689 'id': 'a9LDPn-MO4I',
690 'ext': 'm4a',
691 'upload_date': '20121002',
692 'uploader_id': '8KVIDEO',
ec85ded8 693 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
694 'description': '',
695 'uploader': '8KVIDEO',
696 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 697 },
4bc3a23e
PH
698 'params': {
699 'youtube_include_dash_manifest': True,
700 'format': '141',
4919603f 701 },
de3c7fe0 702 'skip': 'format 141 not served anymore',
dd27fd17 703 },
aa79ac0c
PH
704 # Controversy video
705 {
706 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
707 'info_dict': {
708 'id': 'T4XJQO3qol8',
709 'ext': 'mp4',
556dbe7f 710 'duration': 219,
aa79ac0c 711 'upload_date': '20100909',
4fe54c12 712 'uploader': 'Amazing Atheist',
aa79ac0c 713 'uploader_id': 'TheAmazingAtheist',
ec85ded8 714 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
715 'title': 'Burning Everyone\'s Koran',
716 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
717 }
c522adb1 718 },
dd2d55f1 719 # Normal age-gate video (embed allowed)
c522adb1 720 {
2d3d2997 721 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
722 'info_dict': {
723 'id': 'HtVdAasjOgU',
724 'ext': 'mp4',
725 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 726 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 727 'duration': 142,
c522adb1
JMF
728 'uploader': 'The Witcher',
729 'uploader_id': 'WitcherGame',
ec85ded8 730 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 731 'upload_date': '20140605',
34952f09 732 'age_limit': 18,
c522adb1
JMF
733 },
734 },
067aa17e 735 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
736 {
737 'url': 'lqQg6PlCWgI',
738 'info_dict': {
739 'id': 'lqQg6PlCWgI',
740 'ext': 'mp4',
556dbe7f 741 'duration': 6085,
90227264 742 'upload_date': '20150827',
cbe2bd91 743 'uploader_id': 'olympic',
ec85ded8 744 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 745 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 746 'uploader': 'Olympic',
cbe2bd91
PH
747 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
748 },
749 'params': {
750 'skip_download': 'requires avconv',
e52a40ab 751 }
cbe2bd91 752 },
6271f1ca
PH
753 # Non-square pixels
754 {
755 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
756 'info_dict': {
757 'id': '_b-2C3KPAM0',
758 'ext': 'mp4',
759 'stretched_ratio': 16 / 9.,
556dbe7f 760 'duration': 85,
6271f1ca
PH
761 'upload_date': '20110310',
762 'uploader_id': 'AllenMeow',
ec85ded8 763 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 764 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 765 'uploader': '孫ᄋᄅ',
6271f1ca
PH
766 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
767 },
06b491eb
S
768 },
769 # url_encoded_fmt_stream_map is empty string
770 {
771 'url': 'qEJwOuvDf7I',
772 'info_dict': {
773 'id': 'qEJwOuvDf7I',
f57b7835 774 'ext': 'webm',
06b491eb
S
775 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
776 'description': '',
777 'upload_date': '20150404',
778 'uploader_id': 'spbelect',
779 'uploader': 'Наблюдатели Петербурга',
780 },
781 'params': {
782 'skip_download': 'requires avconv',
e323cf3f
S
783 },
784 'skip': 'This live event has ended.',
06b491eb 785 },
067aa17e 786 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
787 {
788 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
789 'info_dict': {
790 'id': 'FIl7x6_3R5Y',
eb6793ba 791 'ext': 'webm',
da77d856
S
792 'title': 'md5:7b81415841e02ecd4313668cde88737a',
793 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 794 'duration': 220,
da77d856
S
795 'upload_date': '20150625',
796 'uploader_id': 'dorappi2000',
ec85ded8 797 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 798 'uploader': 'dorappi2000',
eb6793ba 799 'formats': 'mincount:31',
da77d856 800 },
eb6793ba 801 'skip': 'not actual anymore',
2ee8f5d8 802 },
8a1a26ce
YCH
803 # DASH manifest with segment_list
804 {
805 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
806 'md5': '8ce563a1d667b599d21064e982ab9e31',
807 'info_dict': {
808 'id': 'CsmdDsKjzN8',
809 'ext': 'mp4',
17ee98e1 810 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
811 'uploader': 'Airtek',
812 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
813 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
814 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
815 },
816 'params': {
817 'youtube_include_dash_manifest': True,
818 'format': '135', # bestvideo
be49068d
S
819 },
820 'skip': 'This live event has ended.',
2ee8f5d8 821 },
cf7e015f
S
822 {
823 # Multifeed videos (multiple cameras), URL is for Main Camera
824 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
825 'info_dict': {
826 'id': 'jqWvoWXjCVs',
827 'title': 'teamPGP: Rocket League Noob Stream',
828 'description': 'md5:dc7872fb300e143831327f1bae3af010',
829 },
830 'playlist': [{
831 'info_dict': {
832 'id': 'jqWvoWXjCVs',
833 'ext': 'mp4',
834 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
835 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 836 'duration': 7335,
cf7e015f
S
837 'upload_date': '20150721',
838 'uploader': 'Beer Games Beer',
839 'uploader_id': 'beergamesbeer',
ec85ded8 840 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 841 'license': 'Standard YouTube License',
cf7e015f
S
842 },
843 }, {
844 'info_dict': {
845 'id': '6h8e8xoXJzg',
846 'ext': 'mp4',
847 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
848 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 849 'duration': 7337,
cf7e015f
S
850 'upload_date': '20150721',
851 'uploader': 'Beer Games Beer',
852 'uploader_id': 'beergamesbeer',
ec85ded8 853 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 854 'license': 'Standard YouTube License',
cf7e015f
S
855 },
856 }, {
857 'info_dict': {
858 'id': 'PUOgX5z9xZw',
859 'ext': 'mp4',
860 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
861 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 862 'duration': 7337,
cf7e015f
S
863 'upload_date': '20150721',
864 'uploader': 'Beer Games Beer',
865 'uploader_id': 'beergamesbeer',
ec85ded8 866 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 867 'license': 'Standard YouTube License',
cf7e015f
S
868 },
869 }, {
870 'info_dict': {
871 'id': 'teuwxikvS5k',
872 'ext': 'mp4',
873 'title': 'teamPGP: Rocket League Noob Stream (zim)',
874 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 875 'duration': 7334,
cf7e015f
S
876 'upload_date': '20150721',
877 'uploader': 'Beer Games Beer',
878 'uploader_id': 'beergamesbeer',
ec85ded8 879 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 880 'license': 'Standard YouTube License',
cf7e015f
S
881 },
882 }],
883 'params': {
884 'skip_download': True,
885 },
4fe54c12 886 'skip': 'This video is not available.',
cbaed4bb 887 },
f9f49d87 888 {
067aa17e 889 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
890 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
891 'info_dict': {
892 'id': 'gVfLd0zydlo',
893 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
894 },
895 'playlist_count': 2,
be49068d 896 'skip': 'Not multifeed anymore',
f9f49d87 897 },
cbaed4bb 898 {
2d3d2997 899 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 900 'only_matching': True,
0e49d9a6 901 },
6d4fc66b 902 {
2d3d2997 903 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
904 'only_matching': True,
905 },
0e49d9a6 906 {
067aa17e 907 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 908 # Also tests cut-off URL expansion in video description (see
067aa17e
S
909 # https://github.com/ytdl-org/youtube-dl/issues/1892,
910 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
911 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
912 'info_dict': {
913 'id': 'lsguqyKfVQg',
914 'ext': 'mp4',
915 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 916 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 917 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 918 'duration': 133,
0e49d9a6
LL
919 'upload_date': '20151119',
920 'uploader_id': 'IronSoulElf',
ec85ded8 921 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 922 'uploader': 'IronSoulElf',
eb6793ba
S
923 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
924 'track': 'Dark Walk - Position Music',
925 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 926 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
927 },
928 'params': {
929 'skip_download': True,
930 },
931 },
61f92af1 932 {
067aa17e 933 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
934 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
935 'only_matching': True,
936 },
313dfc45
LL
937 {
938 # Video with yt:stretch=17:0
939 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
940 'info_dict': {
941 'id': 'Q39EVAstoRM',
942 'ext': 'mp4',
943 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
944 'description': 'md5:ee18a25c350637c8faff806845bddee9',
945 'upload_date': '20151107',
946 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
947 'uploader': 'CH GAMER DROID',
948 },
949 'params': {
950 'skip_download': True,
951 },
be49068d 952 'skip': 'This video does not exist.',
313dfc45 953 },
7caf9830
S
954 {
955 # Video licensed under Creative Commons
956 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
957 'info_dict': {
958 'id': 'M4gD1WSo5mA',
959 'ext': 'mp4',
960 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
961 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 962 'duration': 721,
7caf9830
S
963 'upload_date': '20150127',
964 'uploader_id': 'BerkmanCenter',
ec85ded8 965 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 966 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
967 'license': 'Creative Commons Attribution license (reuse allowed)',
968 },
969 'params': {
970 'skip_download': True,
971 },
972 },
fd050249
S
973 {
974 # Channel-like uploader_url
975 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
976 'info_dict': {
977 'id': 'eQcmzGIKrzg',
978 'ext': 'mp4',
979 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
980 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 981 'duration': 4060,
fd050249 982 'upload_date': '20151119',
eb6793ba 983 'uploader': 'Bernie Sanders',
fd050249 984 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 985 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
986 'license': 'Creative Commons Attribution license (reuse allowed)',
987 },
988 'params': {
989 'skip_download': True,
990 },
991 },
040ac686
S
992 {
993 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
994 'only_matching': True,
7f29cf54
S
995 },
996 {
067aa17e 997 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
998 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
999 'only_matching': True,
6496ccb4
S
1000 },
1001 {
1002 # Rental video preview
1003 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1004 'info_dict': {
1005 'id': 'uGpuVWrhIzE',
1006 'ext': 'mp4',
1007 'title': 'Piku - Trailer',
1008 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1009 'upload_date': '20150811',
1010 'uploader': 'FlixMatrix',
1011 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1012 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1013 'license': 'Standard YouTube License',
1014 },
1015 'params': {
1016 'skip_download': True,
1017 },
eb6793ba 1018 'skip': 'This video is not available.',
022a5d66 1019 },
12afdc2a
S
1020 {
1021 # YouTube Red video with episode data
1022 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1023 'info_dict': {
1024 'id': 'iqKdEhx-dD4',
1025 'ext': 'mp4',
1026 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 1027 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 1028 'duration': 2085,
12afdc2a
S
1029 'upload_date': '20170118',
1030 'uploader': 'Vsauce',
1031 'uploader_id': 'Vsauce',
1032 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
1033 'series': 'Mind Field',
1034 'season_number': 1,
1035 'episode_number': 1,
1036 },
1037 'params': {
1038 'skip_download': True,
1039 },
1040 'expected_warnings': [
1041 'Skipping DASH manifest',
1042 ],
1043 },
c7121fa7
S
1044 {
1045 # The following content has been identified by the YouTube community
1046 # as inappropriate or offensive to some audiences.
1047 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1048 'info_dict': {
1049 'id': '6SJNVb0GnPI',
1050 'ext': 'mp4',
1051 'title': 'Race Differences in Intelligence',
1052 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1053 'duration': 965,
1054 'upload_date': '20140124',
1055 'uploader': 'New Century Foundation',
1056 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1057 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1058 },
1059 'params': {
1060 'skip_download': True,
1061 },
1062 },
022a5d66
S
1063 {
1064 # itag 212
1065 'url': '1t24XAntNCY',
1066 'only_matching': True,
fd5c4aab
S
1067 },
1068 {
1069 # geo restricted to JP
1070 'url': 'sJL6WA-aGkQ',
1071 'only_matching': True,
1072 },
d0ba5587
S
1073 {
1074 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1075 'only_matching': True,
1076 },
cd5a74a2
S
1077 {
1078 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1079 'only_matching': True,
1080 },
825cd268
RA
1081 {
1082 # DRM protected
1083 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1084 'only_matching': True,
4fe54c12
S
1085 },
1086 {
1087 # Video with unsupported adaptive stream type formats
1088 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1089 'info_dict': {
1090 'id': 'Z4Vy8R84T1U',
1091 'ext': 'mp4',
1092 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1093 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1094 'duration': 433,
1095 'upload_date': '20130923',
1096 'uploader': 'Amelia Putri Harwita',
1097 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1098 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1099 'formats': 'maxcount:10',
1100 },
1101 'params': {
1102 'skip_download': True,
1103 'youtube_include_dash_manifest': False,
1104 },
5429d6a9 1105 'skip': 'not actual anymore',
5caabd3c 1106 },
1107 {
822b9d9c 1108 # Youtube Music Auto-generated description
5caabd3c 1109 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1110 'info_dict': {
1111 'id': 'MgNrAu2pzNs',
1112 'ext': 'mp4',
1113 'title': 'Voyeur Girl',
1114 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1115 'upload_date': '20190312',
5429d6a9
S
1116 'uploader': 'Stephen - Topic',
1117 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1118 'artist': 'Stephen',
1119 'track': 'Voyeur Girl',
1120 'album': 'it\'s too much love to know my dear',
1121 'release_date': '20190313',
1122 'release_year': 2019,
1123 },
1124 'params': {
1125 'skip_download': True,
1126 },
1127 },
1128 {
822b9d9c 1129 # Youtube Music Auto-generated description
5caabd3c 1130 # Retrieve 'artist' field from 'Artist:' in video description
1131 # when it is present on youtube music video
5caabd3c 1132 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1133 'info_dict': {
1134 'id': 'k0jLE7tTwjY',
1135 'ext': 'mp4',
1136 'title': 'Latch Feat. Sam Smith',
1137 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1138 'upload_date': '20150110',
1139 'uploader': 'Various Artists - Topic',
1140 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1141 'artist': 'Disclosure',
1142 'track': 'Latch Feat. Sam Smith',
1143 'album': 'Latch Featuring Sam Smith',
1144 'release_date': '20121008',
1145 'release_year': 2012,
1146 },
1147 'params': {
1148 'skip_download': True,
1149 },
1150 },
1151 {
822b9d9c 1152 # Youtube Music Auto-generated description
5caabd3c 1153 # handle multiple artists on youtube music video
1154 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1155 'info_dict': {
1156 'id': '74qn0eJSjpA',
1157 'ext': 'mp4',
1158 'title': 'Eastside',
1159 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1160 'upload_date': '20180710',
1161 'uploader': 'Benny Blanco - Topic',
1162 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1163 'artist': 'benny blanco, Halsey, Khalid',
1164 'track': 'Eastside',
1165 'album': 'Eastside',
1166 'release_date': '20180713',
1167 'release_year': 2018,
1168 },
1169 'params': {
1170 'skip_download': True,
1171 },
1172 },
1173 {
822b9d9c 1174 # Youtube Music Auto-generated description
5caabd3c 1175 # handle youtube music video with release_year and no release_date
1176 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1177 'info_dict': {
1178 'id': '-hcAI0g-f5M',
1179 'ext': 'mp4',
1180 'title': 'Put It On Me',
5429d6a9 1181 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1182 'upload_date': '20180426',
1183 'uploader': 'Matt Maeson - Topic',
1184 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1185 'artist': 'Matt Maeson',
1186 'track': 'Put It On Me',
1187 'album': 'The Hearse',
1188 'release_date': None,
1189 'release_year': 2018,
1190 },
1191 'params': {
1192 'skip_download': True,
1193 },
1194 },
66b48727
RA
1195 {
1196 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1197 'only_matching': True,
1198 },
011e75e6
S
1199 {
1200 # invalid -> valid video id redirection
1201 'url': 'DJztXj2GPfl',
1202 'info_dict': {
1203 'id': 'DJztXj2GPfk',
1204 'ext': 'mp4',
1205 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1206 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1207 'upload_date': '20090125',
1208 'uploader': 'Prochorowka',
1209 'uploader_id': 'Prochorowka',
1210 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1211 'artist': 'Panjabi MC',
1212 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1213 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1214 },
1215 'params': {
1216 'skip_download': True,
1217 },
ea74e00b
DP
1218 },
1219 {
1220 # empty description results in an empty string
1221 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1222 'info_dict': {
1223 'id': 'x41yOUIvK2k',
1224 'ext': 'mp4',
1225 'title': 'IMG 3456',
1226 'description': '',
1227 'upload_date': '20170613',
1228 'uploader_id': 'ElevageOrVert',
1229 'uploader': 'ElevageOrVert',
1230 },
1231 'params': {
1232 'skip_download': True,
1233 },
1234 },
2eb88d95
PH
1235 ]
1236
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature cache id) -> deciphering function; it is
    # filled lazily by _decrypt_signature.
    self._player_cache = dict()
e0df6211 1240
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Report that the video info webpage of ``video_id`` is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 1244
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Report that information about ``video_id`` is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
1248
def report_unavailable_format(self, video_id, format):
    """Report that the requested ``format`` is not available for ``video_id``."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
1252
def report_rtmp_download(self):
    """Report that an RTMP download has been detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 1256
60064c53
PH
1257 def _signature_cache_id(self, example_sig):
1258 """ Return a string representation of a signature """
78caa52a 1259 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53 1260
e40c758c
S
1261 @classmethod
1262 def _extract_player_info(cls, player_url):
1263 for player_re in cls._PLAYER_INFO_RE:
1264 id_m = re.search(player_re, player_url)
1265 if id_m:
1266 break
1267 else:
c081b35c 1268 raise ExtractorError('Cannot identify player %r' % player_url)
e40c758c
S
1269 return id_m.group('ext'), id_m.group('id')
1270
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like ``example_sig``.

    The result is looked up in the downloader's 'youtube-sigfuncs' cache
    first. On a miss the player at ``player_url`` is downloaded, parsed with
    ``_parse_sig_js`` or ``_parse_sig_swf`` depending on the player type, and
    the resulting function is stored in the cache as a list of indices.

    Raises ExtractorError if the cache id is not a plain file name or the
    player type is neither 'js' nor 'swf'. These checks used to be ``assert``
    statements, which are removed when Python runs with ``-O``; in that case
    an unknown player type fell through to an unbound ``res``.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    if os.path.basename(func_id) != func_id:
        # The id is handed to the cache as a key, so it must not contain
        # any path component.
        raise ExtractorError(
            'Invalid signature function cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # A cached spec is a list of indices into the input signature.
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        raise ExtractorError('Invalid player type %r' % player_type)

    # Feed the function a string whose characters are the code points
    # 0..len-1; the code points of the output are then the indices the
    # function picks, which is what gets cached.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
1310
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is run on a probe string whose characters are the code points
    0..len(example_sig)-1. The code points of its output are the indices it
    selects, and these are rendered as a sum of ``s[i]`` items and slices.
    The generated snippet is sent to ``self.to_screen``.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing selected: there is no term to emit.
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # Use the last index explicitly instead of the loop variable, which
        # is unbound when idxs holds a single element (the loop never runs).
        last = idxs[-1]
        if step is None:
            yield 's[%d]' % last
        else:
            yield _genslice(start, last, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    # An empty selection is rendered as an empty string literal so that the
    # printed return statement stays a valid expression.
    expr_code = ' + '.join(gen_sig_code(cache_spec)) or "''"
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1349
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature deciphering callable from JS player code.

    The name of the signature function is located with the regexes below
    (named group ``sig``), the function is extracted with JSInterpreter, and
    a wrapper taking the scrambled signature as its single argument is
    returned.
    """
    sig_name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    sig_function_name = self._search_regex(
        sig_name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    js_decipher = interpreter.extract_function(sig_function_name)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return js_decipher([s])

    return decipher
1370
def _parse_sig_swf(self, file_contents):
    """Build a signature deciphering callable from SWF player contents.

    The ``decipher`` function of the ``SignatureDecipher`` class is
    extracted with SWFInterpreter and wrapped so that it takes the scrambled
    signature as its single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_decipher = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return swf_decipher([s])

    return decipher
1377
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    ``player_url`` is first made absolute (protocol-relative URLs get
    ``https:``, other non-http(s) URLs are joined onto
    https://www.youtube.com). The deciphering function is then taken from
    ``self._player_cache`` or extracted and stored there. ``age_gate`` is
    accepted but not used in this method.

    Raises ExtractorError when ``player_url`` is None or when anything goes
    wrong while obtaining or applying the function; in the latter case the
    formatted traceback is included in the message.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        # Functions are cached per player and per signature shape.
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as exc:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=exc)
e0df6211 1404
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Collect the subtitle tracks available for ``video_id``.

    The track list is downloaded from the timedtext endpoint; for every
    language (first track wins) one entry per format in
    ``self._SUBTITLE_FORMATS`` is produced. When ``has_live_chat_replay`` is
    true a ``'live_chat'`` entry is added as well. ``webpage`` is not used
    in this method.

    Returns a dict mapping language code to a list of format dicts, or an
    empty dict (after a warning) if the list cannot be downloaded or
    nothing was found.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    def build_format(track, lang, ext):
        # One downloadable subtitle variant of a track.
        query = compat_urllib_parse_urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': ext,
            'name': track.attrib['name'].encode('utf-8'),
        })
        return {
            'url': 'https://www.youtube.com/api/timedtext?' + query,
            'ext': ext,
        }

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang not in subtitles:
            subtitles[lang] = [
                build_format(track, lang, ext)
                for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1444
a72778d3
S
def _get_ytplayer_config(self, video_id, webpage):
    """Extract the player configuration JSON embedded in ``webpage``.

    Returns the result of parsing the first matching JSON blob (passed
    through ``uppercase_escape`` first), or None when no pattern matches.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    config_regexes = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
        r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
    )
    raw_config = self._search_regex(
        config_regexes, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1463
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from the initial data of a watch page.

    The metadata rows of every ``videoSecondaryInfoRenderer`` found under
    ``contents.twoColumnWatchNextResults.results.results.contents`` are
    scanned. Rows titled 'Album', 'Artist' or 'Song' are stored under the
    keys 'album', 'artist' and 'track'; a key that repeats starts a new
    track.

    Returns a list of dicts, one per track (possibly empty).
    """
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    music_metadata = []
    contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata

    for content in contents:
        if not isinstance(content, dict):
            continue
        secondary_info = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(secondary_info, dict):
            continue
        rows = try_get(secondary_info, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue

        music_track = {}
        for row in rows:
            row_renderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(row_renderer, dict):
                continue
            key = try_get(row_renderer, lambda x: x['title']['simpleText'])
            value = (
                try_get(row_renderer, lambda x: x['contents'][0]['simpleText'])
                or try_get(row_renderer, lambda x: x['contents'][0]['runs'][0]['text']))
            # Compare against compat_str rather than str: on Python 2 text
            # decoded from JSON is unicode, so an exact ``str`` type check
            # would discard every row.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1501
def _get_automatic_captions(self, video_id, webpage):
    """Build the automatic (machine translated) caption table for a video.

    The webpage is passed in, rather than downloaded here, because the
    player config holding the caption URLs is extracted from it.

    Three layouts of the player config are tried in turn:
      1. the "ttsurl" layout (args['ttsurl'] plus a track list request),
      2. a player response carrying 'playerCaptionsTracklistRenderer',
      3. the 'caption_tracks' / 'caption_translation_languages' layout.

    Returns a dict mapping a language code to a list of
    {'url': ..., 'ext': ...} dicts, one per entry of
    self._SUBTITLE_FORMATS.  An empty dict is returned (after a warning)
    whenever no automatic captions can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        if "args" in player_config and "ttsurl" in player_config["args"]:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']

            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Rewrite the query string of sub_url once per (language, format)
            # pair, overriding only the 'tlang' and 'fmt' parameters.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        if "args" in player_config:
            player_response = player_config["args"].get('player_response')
        else:
            # New player system (ytInitialPlayerResponse) as of October 2020
            player_response = player_config

        if isinstance(player_response, compat_str):
            # With fatal=False a malformed payload yields None instead of
            # raising, so the result must be re-checked before indexing.
            player_response = self._parse_json(
                player_response, video_id, fatal=False)

        if player_response and isinstance(player_response, dict):
            renderer = player_response['captions']['playerCaptionsTracklistRenderer']
            caption_tracks = renderer['captionTracks']
            for caption_track in caption_tracks:
                if 'kind' not in caption_track:
                    # not an automatic transcription
                    continue
                base_url = caption_track['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                # Only the first automatic track is used as translation source
                return make_captions(base_url, sub_lang_list)

            self._downloader.report_warning(err_msg)
            return {}

        if "args" in player_config:
            args = player_config["args"]

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles.  TypeError covers
    # unexpectedly shaped data (e.g. a None where a dict was expected).
    except (KeyError, IndexError, TypeError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

    # No known layout matched: still hand back a dict rather than None
    self._downloader.report_warning(err_msg)
    return {}
1621
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback statistics URL of a video ("Marking watched").

    The base URL is read from
    player_response['playbackTracking']['videostatsPlaybackUrl']['baseUrl']
    or, failing that, from video_info['videostats_playback_base_url'][0].
    Nothing is done when neither yields a valid URL.  The request itself is
    issued with fatal=False.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.randint() includes both bounds: 0..255 gives 256 values that
    # map evenly onto the 64 characters, whereas an upper bound of 256
    # would add one extra value (256 & 63 == 0) favouring the first one.
    cpn = ''.join(
        CPN_ALPHABET[random.randint(0, 255) & 63] for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1647
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Collect every YouTube embed reference found in a page's HTML.

    Three kinds of markup are scanned, in this order:
      * regular player embeds (iframe/embed/object/SWFObject and
        data-video-url attributes), yielding the embedded URL,
      * lazyYT elements, yielding the data-youtube-id attribute value,
      * Wordpress "YouTube Video Importer" players, yielding the
        data-video_id attribute value.

    Returns a list of strings (possibly empty) in order of discovery.
    """
    # Embedded YouTube player
    # NOTE(review): the `embedSWF\(?:\s*` alternative reads as "optional
    # parenthesis, then a literal colon" - possibly a typo; kept unchanged.
    player_matches = re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)
    found = [unescapeHTML(match.group('url')) for match in player_matches]

    # lazyYT YouTube embed
    lazy_ids = re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
    for lazy_id in lazy_ids:
        found.append(unescapeHTML(lazy_id))

    # Wordpress "YouTube Video Importer" plugin
    importer_matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    for groups in importer_matches:
        # The last group is the data-video_id value (the others are quotes)
        found.append(groups[-1])

    return found
1679
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by YoutubeIE._extract_urls, or None."""
    candidates = YoutubeIE._extract_urls(webpage)
    if not candidates:
        return None
    return candidates[0]
1684
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id: group 2 of cls._VALID_URL matched against url.

    The pattern is applied in verbose mode.  Raises ExtractorError when
    the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1692
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Read chapters from the ytInitialData JSON embedded in the webpage.

    Each chapter starts at its own 'timeRangeStartMillis' (converted to
    seconds) and ends where the next listed chapter starts; the last one
    ends at `duration`.  Entries whose start or end cannot be determined
    are left out.

    Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
    when the webpage is empty or carries no usable chapter list.
    """
    if not webpage:
        return
    raw_initial_data = self._search_regex(
        r'window\["ytInitialData"\] = (.+);\n', webpage,
        'player args', default='{}')
    initial_data = self._parse_json(raw_initial_data, video_id, fatal=False)
    if not (initial_data and isinstance(initial_data, dict)):
        return

    def find_chapters(data):
        return (data['playerOverlays']
                    ['playerOverlayRenderer']
                    ['decoratedPlayerBarRenderer']
                    ['decoratedPlayerBarRenderer']
                    ['playerBar']
                    ['chapteredPlayerBarRenderer']
                    ['chapters'])

    chapters_list = try_get(initial_data, find_chapters, list)
    if not chapters_list:
        return

    def start_seconds(entry):
        # Start offset is stored in milliseconds
        millis = try_get(
            entry, lambda x: x['chapterRenderer']['timeRangeStartMillis'], int)
        return float_or_none(millis, scale=1000)

    total = len(chapters_list)
    chapters = []
    for index, chapter in enumerate(chapters_list):
        start_time = start_seconds(chapter)
        if start_time is None:
            continue
        following = index + 1
        if following < total:
            end_time = start_seconds(chapters_list[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return chapters
1741
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Derive chapters from seekTo timestamp links in an HTML description.

    A chapter is a description line containing an
    onclick="yt.www.watch.player.seekTo..." anchor whose text is a
    timestamp.  It ends where the next such line starts; the last one ends
    at `duration`.

    `duration` may be None (unknown).  In that case no clamping against
    the video length is performed and the last chapter, whose end cannot
    be determined, is left out.

    Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
    when there is no description or no chapter line in it.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Comparing a number with None raises TypeError on Python 3, so
        # every duration based check is skipped for an unknown duration.
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        if start_time > end_time:
            # Timestamps are no longer ascending: stop collecting
            break
        # Title is the line without the timestamp link, whitespace-collapsed
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1776
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, else from the description.

    The description is consulted only when the JSON based extraction
    yields nothing (None or an empty list).
    """
    chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if chapters:
        return chapters
    return self._extract_chapters_from_description(description, duration)
1780
c5e8d7af 1781 def _real_extract(self, url):
cf7e015f
S
1782 url, smuggled_data = unsmuggle_url(url, {})
1783
7e8c0af0 1784 proto = (
78caa52a
PH
1785 'http' if self._downloader.params.get('prefer_insecure', False)
1786 else 'https')
7e8c0af0 1787
7c80519c 1788 start_time = None
297a564b 1789 end_time = None
7c80519c
JMF
1790 parsed_url = compat_urllib_parse_urlparse(url)
1791 for component in [parsed_url.fragment, parsed_url.query]:
1792 query = compat_parse_qs(component)
297a564b 1793 if start_time is None and 't' in query:
7c80519c 1794 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1795 if start_time is None and 'start' in query:
1796 start_time = parse_duration(query['start'][0])
297a564b
JMF
1797 if end_time is None and 'end' in query:
1798 end_time = parse_duration(query['end'][0])
7c80519c 1799
c5e8d7af
PH
1800 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1801 mobj = re.search(self._NEXT_URL_RE, url)
1802 if mobj:
7fd002c0 1803 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1804 video_id = self.extract_id(url)
c5e8d7af
PH
1805
1806 # Get video webpage
aa79ac0c 1807 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1808 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1809
1810 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1811 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1812
1813 # Attempt to extract SWF player URL
e0df6211 1814 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1815 if mobj is not None:
1816 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1817 else:
1818 player_url = None
1819
d8d24a92
S
1820 dash_mpds = []
1821
1822 def add_dash_mpd(video_info):
1823 dash_mpd = video_info.get('dashmpd')
1824 if dash_mpd and dash_mpd[0] not in dash_mpds:
1825 dash_mpds.append(dash_mpd[0])
1826
561b456e
S
1827 def add_dash_mpd_pr(pl_response):
1828 dash_mpd = url_or_none(try_get(
1829 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1830 compat_str))
1831 if dash_mpd and dash_mpd not in dash_mpds:
1832 dash_mpds.append(dash_mpd)
1833
c7121fa7
S
1834 is_live = None
1835 view_count = None
1836
1837 def extract_view_count(v_info):
1838 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1839
c2d125d9
S
1840 def extract_player_response(player_response, video_id):
1841 pl_response = str_or_none(player_response)
1842 if not pl_response:
1843 return
1844 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1845 if isinstance(pl_response, dict):
1846 add_dash_mpd_pr(pl_response)
1847 return pl_response
1848
fb2c9277
U
1849 def extract_embedded_config(embed_webpage, video_id):
1850 embedded_config = self._search_regex(
1851 r'setConfig\(({.*})\);',
1852 embed_webpage, 'ytInitialData', default=None)
1853 if embedded_config:
1854 return embedded_config
1855
dbdaaa23
S
1856 player_response = {}
1857
c5e8d7af 1858 # Get video info
43ebf77d 1859 video_info = {}
6449cd80 1860 embed_webpage = None
39e7107d
U
1861 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1862 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1863 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1864 age_gate = True
1865 # We simulate the access to the video from www.youtube.com/v/{video_id}
1866 # this can be viewed without login into Youtube
beb95e77
CL
1867 url = proto + '://www.youtube.com/embed/%s' % video_id
1868 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1869 ext = extract_embedded_config(embed_webpage, video_id)
1870 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1871 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1872 if not playable_in_embed:
1873 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1874 playable_in_embed = ''
1875 else:
1876 playable_in_embed = playable_in_embed.group('playableinEmbed')
1877 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1878 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1879 if playable_in_embed == 'false':
c73baf23
U
1880 '''
1881 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1882 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1883 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1884 '''
1885 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1886 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1887 age_gate = False
1888 # Try looking directly into the video webpage
1889 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1890 if ytplayer_config:
59c5fa91
PO
1891 args = ytplayer_config.get("args")
1892 if args is not None:
1893 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1894 # Convert to the same format returned by compat_parse_qs
1895 video_info = dict((k, [v]) for k, v in args.items())
1896 add_dash_mpd(video_info)
1897 # Rental video is not rented but preview is available (e.g.
1898 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1899 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1900 if not video_info and args.get('ypc_vid'):
1901 return self.url_result(
1902 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1903 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1904 is_live = True
1905 if not player_response:
1906 player_response = extract_player_response(args.get('player_response'), video_id)
1907 elif not player_response:
1908 player_response = ytplayer_config
4bb9c880
U
1909 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1910 add_dash_mpd_pr(player_response)
9d9314cb
U
1911 else:
1912 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1913 else:
1914 data = compat_urllib_parse_urlencode({
1915 'video_id': video_id,
1916 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1917 'sts': self._search_regex(
1918 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1919 })
1920 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1921 try:
1922 video_info_webpage = self._download_webpage(
1923 video_info_url, video_id,
1924 note='Refetching age-gated info webpage',
1925 errnote='unable to download video info webpage')
1926 except ExtractorError:
1927 video_info_webpage = None
1928 if video_info_webpage:
1929 video_info = compat_parse_qs(video_info_webpage)
1930 pl_response = video_info.get('player_response', [None])[0]
1931 player_response = extract_player_response(pl_response, video_id)
1932 add_dash_mpd(video_info)
1933 view_count = extract_view_count(video_info)
c108eb73
JMF
1934 else:
1935 age_gate = False
d8d24a92 1936 # Try looking directly into the video webpage
a72778d3 1937 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
59c5fa91
PO
1938 args = ytplayer_config.get("args")
1939 if args is not None:
4c76aa06 1940 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1941 # Convert to the same format returned by compat_parse_qs
1942 video_info = dict((k, [v]) for k, v in args.items())
1943 add_dash_mpd(video_info)
6496ccb4
S
1944 # Rental video is not rented but preview is available (e.g.
1945 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1946 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1947 if not video_info and args.get('ypc_vid'):
1948 return self.url_result(
1949 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1950 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1951 is_live = True
dbdaaa23 1952 if not player_response:
c2d125d9 1953 player_response = extract_player_response(args.get('player_response'), video_id)
59c5fa91
PO
1954 elif not player_response:
1955 player_response = ytplayer_config
0a3cf9ad 1956 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1957 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1958
1959 def extract_unavailable_message():
0add33ab
S
1960 messages = []
1961 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1962 msg = self._html_search_regex(
1963 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1964 video_webpage, 'unavailable %s' % kind, default=None)
1965 if msg:
1966 messages.append(msg)
1967 if messages:
1968 return '\n'.join(messages)
bbb7c3f7 1969
f93abcf1 1970 if not video_info and not player_response:
15be3eb5
RA
1971 unavailable_message = extract_unavailable_message()
1972 if not unavailable_message:
1973 unavailable_message = 'Unable to extract video data'
1974 raise ExtractorError(
1975 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1976
f93abcf1
S
1977 if not isinstance(video_info, dict):
1978 video_info = {}
1979
dbdaaa23
S
1980 video_details = try_get(
1981 player_response, lambda x: x['videoDetails'], dict) or {}
1982
37357d21
S
1983 microformat = try_get(
1984 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1985
8dbf751a
RA
1986 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1987 if not video_title:
cf7e015f
S
1988 self._downloader.report_warning('Unable to extract video title')
1989 video_title = '_'
1990
9cafc3fd 1991 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1992 if video_description:
fa4bc6e7
RA
1993
1994 def replace_url(m):
1995 redir_url = compat_urlparse.urljoin(url, m.group(1))
1996 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1997 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1998 qs = compat_parse_qs(parsed_redir_url.query)
1999 q = qs.get('q')
2000 if q and q[0]:
2001 return q[0]
2002 return redir_url
2003
9cafc3fd 2004 description_original = video_description = re.sub(r'''(?x)
cf7e015f 2005 <a\s+
25cb7a0e 2006 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 2007 (?:title|href)="([^"]+)"\s+
25cb7a0e 2008 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 2009 class="[^"]*"[^>]*>
23f13e97 2010 [^<]+\.{3}\s*
cf7e015f 2011 </a>
fa4bc6e7 2012 ''', replace_url, video_description)
cf7e015f
S
2013 video_description = clean_html(video_description)
2014 else:
ea74e00b
DP
2015 video_description = video_details.get('shortDescription')
2016 if video_description is None:
2017 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 2018
8fe10494 2019 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 2020 if not self._downloader.params.get('noplaylist'):
8fe10494
S
2021 multifeed_metadata_list = try_get(
2022 player_response,
2023 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2024 compat_str) or try_get(
2025 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2026 if multifeed_metadata_list:
2027 entries = []
2028 feed_ids = []
2029 for feed in multifeed_metadata_list.split(','):
2030 # Unquote should take place before split on comma (,) since textual
2031 # fields may contain comma as well (see
067aa17e 2032 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 2033 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
2034
2035 def feed_entry(name):
2036 return try_get(feed_data, lambda x: x[name][0], compat_str)
2037
2038 feed_id = feed_entry('id')
2039 if not feed_id:
2040 continue
2041 feed_title = feed_entry('title')
2042 title = video_title
2043 if feed_title:
2044 title += ' (%s)' % feed_title
8fe10494
S
2045 entries.append({
2046 '_type': 'url_transparent',
2047 'ie_key': 'Youtube',
2048 'url': smuggle_url(
2049 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2050 {'force_singlefeed': True}),
6b09401b 2051 'title': title,
8fe10494 2052 })
6b09401b 2053 feed_ids.append(feed_id)
8fe10494
S
2054 self.to_screen(
2055 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2056 % (', '.join(feed_ids), video_id))
2057 return self.playlist_result(entries, video_id, video_title, video_description)
2058 else:
2059 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 2060
c7121fa7 2061 if view_count is None:
1c9c8de2 2062 view_count = extract_view_count(video_info)
dbdaaa23
S
2063 if view_count is None and video_details:
2064 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
2065 if view_count is None and microformat:
2066 view_count = int_or_none(microformat.get('viewCount'))
1d699755 2067
27019dbb 2068 if is_live is None:
898238e9 2069 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 2070
321bf820 2071 has_live_chat_replay = False
f0f76a33 2072 if not is_live:
321bf820 2073 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2074 try:
2075 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2076 has_live_chat_replay = True
f0f76a33 2077 except (KeyError, IndexError, TypeError):
321bf820 2078 pass
2079
c5e8d7af
PH
2080 # Check for "rental" videos
2081 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 2082 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 2083
c63ca0ee
S
2084 def _extract_filesize(media_url):
2085 return int_or_none(self._search_regex(
2086 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2087
bf1317d2
S
2088 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2089 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2090
c5e8d7af
PH
2091 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2092 self.report_rtmp_download()
dd27fd17
PH
2093 formats = [{
2094 'format_id': '_rtmp',
2095 'protocol': 'rtmp',
2096 'url': video_info['conn'][0],
2097 'player_url': player_url,
2098 }]
bf1317d2 2099 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 2100 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 2101 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 2102 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 2103 formats = []
3318832e 2104 formats_spec = {}
82156fdb 2105 fmt_list = video_info.get('fmt_list', [''])[0]
2106 if fmt_list:
2107 for fmt in fmt_list.split(','):
2108 spec = fmt.split('/')
3318832e 2109 if len(spec) > 1:
2110 width_height = spec[1].split('x')
2111 if len(width_height) == 2:
2112 formats_spec[spec[0]] = {
2113 'resolution': spec[1],
2114 'width': int_or_none(width_height[0]),
2115 'height': int_or_none(width_height[1]),
2116 }
bf1317d2
S
2117 for fmt in streaming_formats:
2118 itag = str_or_none(fmt.get('itag'))
2119 if not itag:
201e9eaa 2120 continue
bf1317d2
S
2121 quality = fmt.get('quality')
2122 quality_label = fmt.get('qualityLabel') or quality
2123 formats_spec[itag] = {
2124 'asr': int_or_none(fmt.get('audioSampleRate')),
2125 'filesize': int_or_none(fmt.get('contentLength')),
2126 'format_note': quality_label,
2127 'fps': int_or_none(fmt.get('fps')),
2128 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2129 # bitrate for itag 43 is always 2147483647
2130 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2131 'width': int_or_none(fmt.get('width')),
2132 }
2133
2134 for fmt in streaming_formats:
00eb865b 2135 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2136 continue
2137 url = url_or_none(fmt.get('url'))
2138
2139 if not url:
fa3db383 2140 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2141 if not cipher:
2142 continue
2143 url_data = compat_parse_qs(cipher)
2144 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2145 if not url:
2146 continue
2147 else:
2148 cipher = None
2149 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2150
2f483bc1
S
2151 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2152 # Unsupported FORMAT_STREAM_TYPE_OTF
2153 if stream_type == 3:
2154 continue
6449cd80 2155
bf1317d2
S
2156 format_id = fmt.get('itag') or url_data['itag'][0]
2157 if not format_id:
2158 continue
2159 format_id = compat_str(format_id)
a49eccdf 2160
bf1317d2
S
2161 if cipher:
2162 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
67b19799 2163 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
bf1317d2
S
2164 jsplayer_url_json = self._search_regex(
2165 ASSETS_RE,
2166 embed_webpage if age_gate else video_webpage,
2167 'JS player URL (1)', default=None)
2168 if not jsplayer_url_json and not age_gate:
2169 # We need the embed website after all
2170 if embed_webpage is None:
2171 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2172 embed_webpage = self._download_webpage(
2173 embed_url, video_id, 'Downloading embed webpage')
2174 jsplayer_url_json = self._search_regex(
2175 ASSETS_RE, embed_webpage, 'JS player URL')
2176
2177 player_url = json.loads(jsplayer_url_json)
cf010131 2178 if player_url is None:
bf1317d2
S
2179 player_url_json = self._search_regex(
2180 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2181 video_webpage, 'age gate player URL')
2182 player_url = json.loads(player_url_json)
2183
2184 if 'sig' in url_data:
2185 url += '&signature=' + url_data['sig'][0]
2186 elif 's' in url_data:
2187 encrypted_sig = url_data['s'][0]
2188
2189 if self._downloader.params.get('verbose'):
2190 if player_url is None:
bf1317d2 2191 player_desc = 'unknown'
cf010131 2192 else:
e40c758c
S
2193 player_type, player_version = self._extract_player_info(player_url)
2194 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2195 parts_sizes = self._signature_cache_id(encrypted_sig)
2196 self.to_screen('{%s} signature length %s, %s' %
2197 (format_id, parts_sizes, player_desc))
2198
2199 signature = self._decrypt_signature(
2200 encrypted_sig, video_id, player_url, age_gate)
2201 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2202 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2203 if 'ratebypass' not in url:
2204 url += '&ratebypass=yes'
c9afb51c 2205
94278f72
YCH
2206 dct = {
2207 'format_id': format_id,
2208 'url': url,
2209 'player_url': player_url,
2210 }
2211 if format_id in self._formats:
2212 dct.update(self._formats[format_id])
3318832e 2213 if format_id in formats_spec:
2214 dct.update(formats_spec[format_id])
94278f72 2215
aabc2be6 2216 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2217 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2218 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2219 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2220 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2221
bf1317d2
S
2222 if width is None:
2223 width = int_or_none(fmt.get('width'))
2224 if height is None:
2225 height = int_or_none(fmt.get('height'))
2226
c63ca0ee
S
2227 filesize = int_or_none(url_data.get(
2228 'clen', [None])[0]) or _extract_filesize(url)
2229
bf1317d2
S
2230 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2231 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2232
4878759f
S
2233 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2234 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2235 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2236
94278f72 2237 more_fields = {
c63ca0ee 2238 'filesize': filesize,
bf1317d2 2239 'tbr': tbr,
c9afb51c
AH
2240 'width': width,
2241 'height': height,
bf1317d2
S
2242 'fps': fps,
2243 'format_note': quality_label or quality,
c9afb51c 2244 }
94278f72
YCH
2245 for key, value in more_fields.items():
2246 if value:
2247 dct[key] = value
bf1317d2 2248 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2249 if type_:
2250 type_split = type_.split(';')
2251 kind_ext = type_split[0].split('/')
2252 if len(kind_ext) == 2:
94278f72
YCH
2253 kind, _ = kind_ext
2254 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2255 if kind in ('audio', 'video'):
2256 codecs = None
2257 for mobj in re.finditer(
2258 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2259 if mobj.group('key') == 'codecs':
2260 codecs = mobj.group('val')
2261 break
2262 if codecs:
6310acf5 2263 dct.update(parse_codecs(codecs))
e4a60912
S
2264 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2265 dct['downloader_options'] = {
2266 # Youtube throttles chunks >~10M
2267 'http_chunk_size': 10485760,
2268 }
aabc2be6 2269 formats.append(dct)
c5e8d7af 2270 else:
c3e54389
S
2271 manifest_url = (
2272 url_or_none(try_get(
2273 player_response,
2274 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2275 compat_str))
2276 or url_or_none(try_get(
c3e54389
S
2277 video_info, lambda x: x['hlsvp'][0], compat_str)))
2278 if manifest_url:
2279 formats = []
2280 m3u8_formats = self._extract_m3u8_formats(
2281 manifest_url, video_id, 'mp4', fatal=False)
2282 for a_format in m3u8_formats:
2283 itag = self._search_regex(
2284 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2285 if itag:
2286 a_format['format_id'] = itag
2287 if itag in self._formats:
2288 dct = self._formats[itag].copy()
2289 dct.update(a_format)
2290 a_format = dct
2291 a_format['player_url'] = player_url
2292 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2293 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2294 if self._downloader.params.get('youtube_include_hls_manifest', True):
2295 formats.append(a_format)
c3e54389 2296 else:
13577349 2297 error_message = extract_unavailable_message()
c3e54389 2298 if not error_message:
13577349
S
2299 error_message = clean_html(try_get(
2300 player_response, lambda x: x['playabilityStatus']['reason'],
2301 compat_str))
2302 if not error_message:
2303 error_message = clean_html(
2304 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2305 if error_message:
2306 raise ExtractorError(error_message, expected=True)
2307 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2308
7e72694b 2309 # uploader
dbdaaa23
S
2310 video_uploader = try_get(
2311 video_info, lambda x: x['author'][0],
2312 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2313 if video_uploader:
2314 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2315 else:
2316 self._downloader.report_warning('unable to extract uploader name')
2317
2318 # uploader_id
2319 video_uploader_id = None
2320 video_uploader_url = None
2321 mobj = re.search(
2322 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2323 video_webpage)
2324 if mobj is not None:
2325 video_uploader_id = mobj.group('uploader_id')
2326 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2327 else:
2328 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2329 if owner_profile_url:
2330 video_uploader_id = self._search_regex(
2331 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2332 default=None)
2333 video_uploader_url = owner_profile_url
7e72694b 2334
b45a9e69 2335 channel_id = (
3089bc74
S
2336 str_or_none(video_details.get('channelId'))
2337 or self._html_search_meta(
2338 'channelId', video_webpage, 'channel id', default=None)
2339 or self._search_regex(
b45a9e69 2340 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2341 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2342 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2343
b477fc13
S
2344 thumbnails = []
2345 thumbnails_list = try_get(
2346 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2347 for t in thumbnails_list:
2348 if not isinstance(t, dict):
2349 continue
2350 thumbnail_url = url_or_none(t.get('url'))
2351 if not thumbnail_url:
2352 continue
2353 thumbnails.append({
2354 'url': thumbnail_url,
2355 'width': int_or_none(t.get('width')),
2356 'height': int_or_none(t.get('height')),
2357 })
2358
2359 if not thumbnails:
7e72694b 2360 video_thumbnail = None
b477fc13
S
2361 # We try first to get a high quality image:
2362 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2363 video_webpage, re.DOTALL)
2364 if m_thumb is not None:
2365 video_thumbnail = m_thumb.group(1)
2366 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2367 if thumbnail_url:
2368 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2369 if video_thumbnail:
2370 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2371
2372 # upload date
2373 upload_date = self._html_search_meta(
2374 'datePublished', video_webpage, 'upload date', default=None)
2375 if not upload_date:
2376 upload_date = self._search_regex(
2377 [r'(?s)id="eow-date.*?>(.*?)</span>',
2378 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2379 video_webpage, 'upload date', default=None)
37357d21
S
2380 if not upload_date:
2381 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2382 upload_date = unified_strdate(upload_date)
2383
2384 video_license = self._html_search_regex(
2385 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2386 video_webpage, 'license', default=None)
2387
2388 m_music = re.search(
2389 r'''(?x)
2390 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2391 <ul[^>]*>\s*
2392 <li>(?P<title>.+?)
2393 by (?P<creator>.+?)
2394 (?:
2395 \(.+?\)|
2396 <a[^>]*
2397 (?:
2398 \bhref=["\']/red[^>]*>| # drop possible
2399 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2400 )
2401 .*?
2402 )?</li
2403 ''',
2404 video_webpage)
2405 if m_music:
2406 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2407 video_creator = clean_html(m_music.group('creator'))
2408 else:
2409 video_alt_title = video_creator = None
2410
2411 def extract_meta(field):
2412 return self._html_search_regex(
2413 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2414 video_webpage, field, default=None)
2415
2416 track = extract_meta('Song')
2417 artist = extract_meta('Artist')
92bc97d3 2418 album = extract_meta('Album')
822b9d9c
RA
2419
2420 # Youtube Music Auto-generated description
92bc97d3 2421 release_date = release_year = None
822b9d9c
RA
2422 if video_description:
2423 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2424 if mobj:
2425 if not track:
2426 track = mobj.group('track').strip()
2427 if not artist:
2428 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2429 if not album:
2430 album = mobj.group('album'.strip())
822b9d9c
RA
2431 release_year = mobj.group('release_year')
2432 release_date = mobj.group('release_date')
2433 if release_date:
2434 release_date = release_date.replace('-', '')
2435 if not release_year:
2436 release_year = int(release_date[:4])
2437 if release_year:
2438 release_year = int(release_year)
7e72694b 2439
9322f116 2440 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2441 if yt_initial:
2442 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2443 if len(music_metadata):
2444 album = music_metadata[0].get('album')
2445 artist = music_metadata[0].get('artist')
2446 track = music_metadata[0].get('track')
2447
7e72694b
S
2448 m_episode = re.search(
2449 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2450 video_webpage)
2451 if m_episode:
c2dd2dc0 2452 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2453 season_number = int(m_episode.group('season'))
2454 episode_number = int(m_episode.group('episode'))
2455 else:
2456 series = season_number = episode_number = None
2457
2458 m_cat_container = self._search_regex(
2459 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2460 video_webpage, 'categories', default=None)
dbeafce5 2461 category = None
7e72694b
S
2462 if m_cat_container:
2463 category = self._html_search_regex(
2464 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2465 default=None)
dbeafce5
S
2466 if not category:
2467 category = try_get(
2468 microformat, lambda x: x['category'], compat_str)
2469 video_categories = None if category is None else [category]
7e72694b
S
2470
2471 video_tags = [
2472 unescapeHTML(m.group('content'))
2473 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2474 if not video_tags:
2475 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2476
2477 def _extract_count(count_name):
2478 return str_to_int(self._search_regex(
a6c666d0 2479 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2480 % re.escape(count_name),
2481 video_webpage, count_name, default=None))
2482
2483 like_count = _extract_count('like')
2484 dislike_count = _extract_count('dislike')
2485
dbdaaa23
S
2486 if view_count is None:
2487 view_count = str_to_int(self._search_regex(
2488 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2489 'view count', default=None))
2490
bf3c9326
S
2491 average_rating = (
2492 float_or_none(video_details.get('averageRating'))
2493 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2494
7e72694b 2495 # subtitles
321bf820 2496 video_subtitles = self.extract_subtitles(
2497 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2498 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2499
2500 video_duration = try_get(
2501 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2502 if not video_duration:
2503 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2504 if not video_duration:
2505 video_duration = parse_duration(self._html_search_meta(
2506 'duration', video_webpage, 'video duration'))
2507
b84071c0
JP
2508 # Get Subscriber Count of channel
2509 subscriber_count = parse_count(self._search_regex(
2510 r'"text":"([\d\.]+\w?) subscribers"',
2511 video_webpage,
2512 'subscriber count',
2513 default=None
2514 ))
2515
7e72694b
S
2516 # annotations
2517 video_annotations = None
2518 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2519 xsrf_token = self._search_regex(
2520 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2521 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2522 invideo_url = try_get(
2523 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2524 if xsrf_token and invideo_url:
2525 xsrf_field_name = self._search_regex(
2526 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2527 video_webpage, 'xsrf field name',
2528 group='xsrf_field_name', default='session_token')
2529 video_annotations = self._download_webpage(
2530 self._proto_relative_url(invideo_url),
2531 video_id, note='Downloading annotations',
2532 errnote='Unable to download video annotations', fatal=False,
2533 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2534
84213ea8 2535 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2536
dd27fd17 2537 # Look for the DASH manifest
203fb43f 2538 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2539 dash_mpd_fatal = True
8ff648e4 2540 for mpd_url in dash_mpds:
d8d24a92 2541 dash_formats = {}
774e208f 2542 try:
05d0d131
YCH
2543 def decrypt_sig(mobj):
2544 s = mobj.group(1)
2545 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2546 return '/signature/%s' % dec_s
2547
8ff648e4 2548 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2549
8ff648e4 2550 for df in self._extract_mpd_formats(
2551 mpd_url, video_id, fatal=dash_mpd_fatal,
2552 formats_dict=self._formats):
c63ca0ee
S
2553 if not df.get('filesize'):
2554 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2555 # Do not overwrite DASH format found in some previous DASH manifest
2556 if df['format_id'] not in dash_formats:
2557 dash_formats[df['format_id']] = df
77c6fb5b
S
2558 # Additional DASH manifests may end up in HTTP Error 403 therefore
2559 # allow them to fail without bug report message if we already have
2560 # some DASH manifest succeeded. This is temporary workaround to reduce
2561 # burst of bug reports until we figure out the reason and whether it
2562 # can be fixed at all.
2563 dash_mpd_fatal = False
774e208f
PH
2564 except (ExtractorError, KeyError) as e:
2565 self.report_warning(
2566 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2567 if dash_formats:
04b3b3df
JMF
2568 # Remove the formats we found through non-DASH, they
2569 # contain less info and it can be wrong, because we use
2570 # fixed values (for example the resolution). See
067aa17e 2571 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2572 # example.
d80265cc 2573 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2574 formats.extend(dash_formats.values())
d80044c2 2575
6271f1ca
PH
2576 # Check for malformed aspect ratio
2577 stretched_m = re.search(
2578 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2579 video_webpage)
2580 if stretched_m:
313dfc45
LL
2581 w = float(stretched_m.group('w'))
2582 h = float(stretched_m.group('h'))
5faf9fed
S
2583 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2584 # We will only process correct ratios.
313dfc45 2585 if w > 0 and h > 0:
41f24c32 2586 ratio = w / h
313dfc45
LL
2587 for f in formats:
2588 if f.get('vcodec') != 'none':
2589 f['stretched_ratio'] = ratio
6271f1ca 2590
026fbedc 2591 if not formats:
43ebf77d
S
2592 if 'reason' in video_info:
2593 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2594 regions_allowed = self._html_search_meta(
2595 'regionsAllowed', video_webpage, default=None)
2596 countries = regions_allowed.split(',') if regions_allowed else None
2597 self.raise_geo_restricted(
2598 msg=video_info['reason'][0], countries=countries)
2599 reason = video_info['reason'][0]
2600 if 'Invalid parameters' in reason:
2601 unavailable_message = extract_unavailable_message()
2602 if unavailable_message:
2603 reason = unavailable_message
2604 raise ExtractorError(
2605 'YouTube said: %s' % reason,
2606 expected=True, video_id=video_id)
2607 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2608 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2609
4bcc7bd1 2610 self._sort_formats(formats)
4ea3be0a 2611
21c340b8 2612 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2613
4ea3be0a 2614 return {
8bcc8756
JW
2615 'id': video_id,
2616 'uploader': video_uploader,
2617 'uploader_id': video_uploader_id,
fd050249 2618 'uploader_url': video_uploader_url,
dd4c4492
S
2619 'channel_id': channel_id,
2620 'channel_url': channel_url,
8bcc8756 2621 'upload_date': upload_date,
7caf9830 2622 'license': video_license,
936784b2 2623 'creator': video_creator or artist,
8bcc8756 2624 'title': video_title,
936784b2 2625 'alt_title': video_alt_title or track,
b477fc13 2626 'thumbnails': thumbnails,
8bcc8756
JW
2627 'description': video_description,
2628 'categories': video_categories,
000b6b5a 2629 'tags': video_tags,
8bcc8756 2630 'subtitles': video_subtitles,
360e1ca5 2631 'automatic_captions': automatic_captions,
8bcc8756
JW
2632 'duration': video_duration,
2633 'age_limit': 18 if age_gate else 0,
2634 'annotations': video_annotations,
9cafc3fd 2635 'chapters': chapters,
7e8c0af0 2636 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2637 'view_count': view_count,
4ea3be0a 2638 'like_count': like_count,
2639 'dislike_count': dislike_count,
bf3c9326 2640 'average_rating': average_rating,
8bcc8756 2641 'formats': formats,
2fe1ff85 2642 'is_live': is_live,
7c80519c 2643 'start_time': start_time,
297a564b 2644 'end_time': end_time,
12afdc2a
S
2645 'series': series,
2646 'season_number': season_number,
2647 'episode_number': episode_number,
936784b2
S
2648 'track': track,
2649 'artist': artist,
5caabd3c 2650 'album': album,
2651 'release_date': release_date,
2652 'release_year': release_year,
b84071c0 2653 'subscriber_count': subscriber_count,
4ea3be0a 2654 }
c5e8d7af 2655
5f6a1245 2656
8e7aad20 2657class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
78caa52a 2658 IE_DESC = 'YouTube.com playlists'
d67cc9fa 2659 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
2660 (?:https?://)?
2661 (?:\w+\.)?
c5e8d7af 2662 (?:
c0345b82 2663 (?:
66b48727 2664 youtube(?:kids)?\.com|
c0345b82
S
2665 invidio\.us
2666 )
2667 /
feaa5ad7 2668 (?:
87dadd45 2669 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
feaa5ad7
S
2670 \? (?:.*?[&;])*? (?:p|a|list)=
2671 | p/
2672 )|
2673 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
c5e8d7af 2674 )
d67cc9fa 2675 (
66b48727 2676 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
5f6a1245 2677 # Top tracks, they can also include dots
d67cc9fa
JMF
2678 |(?:MC)[\w\.]*
2679 )
c5e8d7af
PH
2680 .*
2681 |
d0ba5587
S
2682 (%(playlist_id)s)
2683 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
8d81f3e3 2684 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
351f37c0
S
2685 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2686 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
78caa52a 2687 IE_NAME = 'youtube:playlist'
7f4f0b21 2688 _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
2689 _YTM_CHANNEL_INFO = {
2690 'uploader': 'Youtube Music',
2691 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
2692 'uploader_url': 'https://www.youtube.com/music'
2693 }
81127aa5 2694 _TESTS = [{
0e30a7b9 2695 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
81127aa5 2696 'info_dict': {
0e30a7b9 2697 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2698 'uploader': 'Sergey M.',
2699 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
3867038a 2700 'title': 'youtube-dl public playlist',
81127aa5 2701 },
0e30a7b9 2702 'playlist_count': 1,
9291475f 2703 }, {
0e30a7b9 2704 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
9291475f 2705 'info_dict': {
0e30a7b9 2706 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2707 'uploader': 'Sergey M.',
2708 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
3867038a 2709 'title': 'youtube-dl empty playlist',
9291475f
PH
2710 },
2711 'playlist_count': 0,
2712 }, {
2713 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2714 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2715 'info_dict': {
2716 'title': '29C3: Not my department',
acf757f4 2717 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
13a75688
S
2718 'uploader': 'Christiaan008',
2719 'uploader_id': 'ChRiStIaAn008',
9291475f 2720 },
0e30a7b9 2721 'playlist_count': 96,
9291475f
PH
2722 }, {
2723 'note': 'issue #673',
2724 'url': 'PLBB231211A4F62143',
2725 'info_dict': {
f46a8702 2726 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 2727 'id': 'PLBB231211A4F62143',
13a75688
S
2728 'uploader': 'Wickydoo',
2729 'uploader_id': 'Wickydoo',
9291475f
PH
2730 },
2731 'playlist_mincount': 26,
2732 }, {
2733 'note': 'Large playlist',
2734 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2735 'info_dict': {
2736 'title': 'Uploads from Cauchemar',
acf757f4 2737 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
13a75688
S
2738 'uploader': 'Cauchemar',
2739 'uploader_id': 'Cauchemar89',
9291475f
PH
2740 },
2741 'playlist_mincount': 799,
2742 }, {
2743 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2744 'info_dict': {
2745 'title': 'YDL_safe_search',
acf757f4 2746 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
2747 },
2748 'playlist_count': 2,
4201ba13 2749 'skip': 'This playlist is private',
ac7553d0
PH
2750 }, {
2751 'note': 'embedded',
2d3d2997 2752 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0
PH
2753 'playlist_count': 4,
2754 'info_dict': {
2755 'title': 'JODA15',
acf757f4 2756 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
13a75688
S
2757 'uploader': 'milan',
2758 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
ac7553d0 2759 }
87dadd45
S
2760 }, {
2761 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2762 'playlist_mincount': 485,
2763 'info_dict': {
13a75688 2764 'title': '2018 Chinese New Singles (11/6 updated)',
87dadd45 2765 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
13a75688
S
2766 'uploader': 'LBK',
2767 'uploader_id': 'sdragonfang',
87dadd45 2768 }
6b08cdf6
PH
2769 }, {
2770 'note': 'Embedded SWF player',
2d3d2997 2771 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
6b08cdf6
PH
2772 'playlist_count': 4,
2773 'info_dict': {
2774 'title': 'JODA7',
acf757f4 2775 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
13a75688
S
2776 },
2777 'skip': 'This playlist does not exist',
4b7df0d3
JMF
2778 }, {
2779 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2780 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2781 'info_dict': {
acf757f4
PH
2782 'title': 'Uploads from Interstellar Movie',
2783 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
13a75688
S
2784 'uploader': 'Interstellar Movie',
2785 'uploader_id': 'InterstellarMovie1',
4b7df0d3 2786 },
481cc733 2787 'playlist_mincount': 21,
dacb3a86
S
2788 }, {
2789 # Playlist URL that does not actually serve a playlist
2790 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2791 'info_dict': {
2792 'id': 'FqZTN594JQw',
2793 'ext': 'webm',
2794 'title': "Smiley's People 01 detective, Adventure Series, Action",
2795 'uploader': 'STREEM',
2796 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
ec85ded8 2797 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
dacb3a86
S
2798 'upload_date': '20150526',
2799 'license': 'Standard YouTube License',
2800 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2801 'categories': ['People & Blogs'],
2802 'tags': list,
dbdaaa23 2803 'view_count': int,
dacb3a86
S
2804 'like_count': int,
2805 'dislike_count': int,
2806 },
2807 'params': {
2808 'skip_download': True,
2809 },
13a75688 2810 'skip': 'This video is not available.',
dacb3a86 2811 'add_ie': [YoutubeIE.ie_key()],
481cc733
S
2812 }, {
2813 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2814 'info_dict': {
2815 'id': 'yeWKywCrFtk',
2816 'ext': 'mp4',
2817 'title': 'Small Scale Baler and Braiding Rugs',
2818 'uploader': 'Backus-Page House Museum',
2819 'uploader_id': 'backuspagemuseum',
ec85ded8 2820 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
481cc733 2821 'upload_date': '20161008',
481cc733
S
2822 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2823 'categories': ['Nonprofits & Activism'],
2824 'tags': list,
2825 'like_count': int,
2826 'dislike_count': int,
2827 },
2828 'params': {
2829 'noplaylist': True,
2830 'skip_download': True,
2831 },
2e18adec
S
2832 }, {
2833 # https://github.com/ytdl-org/youtube-dl/issues/21844
2834 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2835 'info_dict': {
2836 'title': 'Data Analysis with Dr Mike Pound',
2837 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2838 'uploader_id': 'Computerphile',
2839 'uploader': 'Computerphile',
2840 },
2841 'playlist_mincount': 11,
feaa5ad7
S
2842 }, {
2843 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2844 'only_matching': True,
a6857510
S
2845 }, {
2846 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2847 'only_matching': True,
409b9324
S
2848 }, {
2849 # music album playlist
2850 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2851 'only_matching': True,
c0345b82
S
2852 }, {
2853 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2854 'only_matching': True,
66b48727
RA
2855 }, {
2856 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2857 'only_matching': True,
81127aa5 2858 }]
c5e8d7af 2859
880e1c52
JMF
2860 def _real_initialize(self):
2861 self._login()
2862
351f37c0
S
def extract_videos_from_page(self, page):
    """Return an iterable of ``(video_id, title)`` pairs found in *page*.

    Tags carrying a ``data-video-id`` attribute are scanned first; their
    ``data-title`` attribute (HTML-unescaped and stripped) supplies the
    title, which may be None. Afterwards a series of regex fallbacks is
    handed to ``self.extract_videos_from_page_impl`` together with both
    accumulator lists (presumably that helper appends matches that are
    still missing -- it is defined outside this block).
    """
    found_ids, found_titles = [], []

    tag_pattern = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_pattern, page):
        tag_attrs = extract_attributes(tag)
        current_id = tag_attrs['data-video-id']
        current_title = unescapeHTML(tag_attrs.get('data-title'))
        # Only strip when there is a non-empty title; None/'' pass through.
        found_ids.append(current_id)
        found_titles.append(
            current_title.strip() if current_title else current_title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, found_ids, found_titles)

    return zip(found_ids, found_titles)
2890
def _extract_mix_ids_from_yt_initial(self, yt_initial):
    """Collect video ids from the playlist panel of *yt_initial*.

    Looks up the list at
    ``contents.twoColumnWatchNextResults.playlist.playlist.contents`` and
    returns, in order, every ``playlistPanelVideoRenderer.videoId`` that
    is a non-empty string. Returns an empty list when the panel is
    missing or empty.
    """
    panel_entries = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'],
        list) or []
    candidate_ids = (
        try_get(entry, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
        for entry in panel_entries)
    return [candidate for candidate in candidate_ids if candidate]
2900
def _extract_mix(self, playlist_id):
    """Extract a YouTube mix as a playlist result.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id. Watch pages are requested repeatedly, each
    time for the last video id collected so far, until a page contributes
    no id that has not been seen already.
    """
    collected_ids = []
    initial_data = None
    seed_id = playlist_id[-11:]
    id_pattern = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)

    page_number = 0
    while True:
        page_number += 1
        watch_url = 'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            watch_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_number))
        page_ids = orderedSet(re.findall(id_pattern, webpage))

        # if no ids in html of page, try using embedded json
        if not page_ids:
            initial_data = self._get_yt_initial_data(playlist_id, webpage)
            if initial_data:
                page_ids = self._extract_mix_ids_from_yt_initial(initial_data)

        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen_ids = [vid for vid in page_ids if vid not in collected_ids]
        if not unseen_ids:
            break
        collected_ids.extend(unseen_ids)
        seed_id = collected_ids[-1]

    entries = self._ids_to_results(collected_ids)

    # Try the known title containers in order on the last downloaded page;
    # the first truthy hit wins, otherwise the last lookup result is kept.
    title_markup = None
    for css_class in ('playlist-title', 'title long-title', 'title'):
        title_markup = get_element_by_attribute('class', css_class, webpage)
        if title_markup:
            break
    mix_title = clean_html(title_markup)

    if not mix_title:
        # Fall back to the embedded JSON fetched during the loop (if any).
        mix_title = try_get(
            initial_data,
            lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'],
            compat_str)

    return self.playlist_result(entries, playlist_id, mix_title)
2943
def _extract_playlist(self, playlist_id):
    """Download the playlist page and build its playlist result.

    Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
    only when no playlist title was found and ``self._entries`` yields
    nothing for the page.

    Raises ExtractorError (expected) when an alert on the page says the
    playlist does not exist, is private, or that the parameters are
    invalid. Other alerts, except the language chooser, are reported as
    warnings.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    for raw_alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = raw_alert.strip()

        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            access_hint = ', use --username or --netrc to access it' if 'private' in reason else ''
            raise ExtractorError(
                'This playlist %s' % reason + access_hint + '.', expected=True)

        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)

        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue

        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    uploader_base_re = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % uploader_base_re,
        page, 'uploader', default=None)

    uploader_id, uploader_url = None, None
    uploader_link = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % uploader_base_re,
        page)
    if uploader_link:
        uploader_id = uploader_link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        nothing = object()
        if next(self._entries(page, playlist_id), nothing) is nothing:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist['uploader'] = uploader
    playlist['uploader_id'] = uploader_id
    playlist['uploader_url'] = uploader_url
    if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
        # Ids with the YouTube Music prefix get the fixed channel info.
        playlist.update(self._YTM_CHANNEL_INFO)

    return has_videos, playlist
c5e8d7af 3007
def _check_download_just_video(self, url, playlist_id):
    """Decide whether *url* should be handled as a single video.

    Returns a ``(video_id, result)`` tuple:
    - ``(None, None)`` when the URL carries no video id;
    - ``(video_id, url_result)`` when a video id is present and the
      ``noplaylist`` downloader param is set;
    - ``(video_id, None)`` when a video id is present but the playlist
      should still be downloaded.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 3022
ebf1b291
S
3023 def _real_extract(self, url):
3024 # Extract playlist id
3025 mobj = re.match(self._VALID_URL, url)
3026 if mobj is None:
3027 raise ExtractorError('Invalid URL: %s' % url)
3028 playlist_id = mobj.group(1) or mobj.group(2)
3029
dacb3a86 3030 video_id, video = self._check_download_just_video(url, playlist_id)
ebf1b291
S
3031 if video:
3032 return video
3033
466a6145 3034 if playlist_id.startswith(('RD', 'UL', 'PU')):
7f4f0b21 3035 if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
659ddd7f 3036 # Mixes require a custom extraction process,
3037 # Youtube Music playlists act like normal playlists (with randomized order)
3038 return self._extract_mix(playlist_id)
448830ce 3039
dacb3a86
S
3040 has_videos, playlist = self._extract_playlist(playlist_id)
3041 if has_videos or not video_id:
3042 return playlist
3043
3044 # Some playlist URLs don't actually serve a playlist (see
067aa17e 3045 # https://github.com/ytdl-org/youtube-dl/issues/10537).
dacb3a86
S
3046 # Fallback to plain video extraction if there is a video id
3047 # along with playlist id.
3048 return self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 3049
c5e8d7af 3050
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` pages."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [
        {
            'note': 'paginated channel',
            'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
            'playlist_mincount': 91,
            'info_dict': {
                'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
                'title': 'Uploads from lex will',
                'uploader': 'lex will',
                'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
            },
        },
        {
            'note': 'Age restricted channel',
            # from https://www.youtube.com/user/DeusExOfficial
            'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
            'playlist_mincount': 64,
            'info_dict': {
                'id': 'UUs0ifCMCm1icqRbqhUINa0w',
                'title': 'Uploads from Deus Ex',
                'uploader': 'Deus Ex',
                'uploader_id': 'DeusExOfficial',
            },
        },
        {
            'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
            'only_matching': True,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the channel's videos URL built from ``_TEMPLATE_URL``."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel, preferring its uploads playlist when available."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Page-by-page channel listing is capped at 35 pages of 30 items,
        # i.e. 1050 videos total (see #5778). Work around this by handing
        # over to the uploads playlist whenever the channel id can be
        # obtained, and only otherwise fall back to page extraction.
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if channel_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the "UC" prefix for "UU" to obtain the playlist id.
            playlist_id = 'UU' + channel_playlist_id[2:]
            playlist_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % playlist_id)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # All videos sit in this single page; the ajax pages can't be
            # used because they are empty.
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for a first entry; when there is none, surface the alert
        # message from the page (if any) as an expected error.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
3150
3151
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for ``/user/<name>``, ``/c/<name>``, bare ``/<name>`` and ``ytuser:``."""

    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/user/TheLinuxFoundation',
            'playlist_mincount': 320,
            'info_dict': {
                'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
                'title': 'Uploads from The Linux Foundation',
                'uploader': 'The Linux Foundation',
                'uploader_id': 'TheLinuxFoundation',
            },
        },
        {
            # Only available via https://www.youtube.com/c/12minuteathlete/videos
            # but not https://www.youtube.com/user/12minuteathlete/videos
            'url': 'https://www.youtube.com/c/12minuteathlete/videos',
            'playlist_mincount': 249,
            'info_dict': {
                'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
                'title': 'Uploads from 12 Minute Athlete',
                'uploader': '12 Minute Athlete',
                'uploader_id': 'the12minuteathlete',
            },
        },
        {
            'url': 'ytuser:phihag',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/c/gametrailers',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/gametrailers',
            'only_matching': True,
        },
        {
            # This channel is not available, geo restricted to JP
            'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
            'only_matching': True,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only when no other Youtube*IE in this module does."""
        # This extractor's regex is too permissive and would also match
        # URLs that belong to the other youtube extractors, so give every
        # one of them precedence.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos URL, keeping the ``user``/``c`` path kind of ``url``."""
        match = re.match(self._VALID_URL, url)
        # Fall back to "user" when the path kind group did not match.
        path_kind = match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, match.group('id'))
3209
b05654f0 3210
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of users and channels."""

    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/user/TheYoungTurks/live',
            'info_dict': {
                'id': 'a48o2S1cPoo',
                'ext': 'mp4',
                'title': 'The Young Turks - Live Main Show',
                'uploader': 'The Young Turks',
                'uploader_id': 'TheYoungTurks',
                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
                'upload_date': '20150715',
                'license': 'Standard YouTube License',
                'description': 'md5:438179573adcdff3c97ebb1ee632b891',
                'categories': ['News & Politics'],
                'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
                'like_count': int,
                'dislike_count': int,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/TheYoungTurks/live',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Delegate to the video found on the live page, else to the base URL."""
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id', 'base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page carrying an id made
        # of exactly 11 id characters.
        is_video_page = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if is_video_page:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3261
3262
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``.../playlists`` pages of users and channels."""

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'ThirstForScience',
            },
        },
        {
            # with "Load more" button
            'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
            'skip': 'Blocked',
        },
        {
            'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
            'only_matching': True,
        },
    ]
0c148415
S
3295
3296
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``ytsearch`` queries."""

    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield url_transparent entries for ``query``, stopping after ``n``.

        Pages are requested one after another; iteration also ends when a
        download fails, a page has no usable results, or no continuation
        token is found.
        """
        payload = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                },
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            payload['params'] = self._SEARCH_PARAMS

        yielded = 0
        for page_num in itertools.count(1):
            response = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(payload).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not response:
                break

            # Two response layouts are tried in turn: the search results
            # renderer and the continuation-items action.
            sections = try_get(
                response,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not sections:
                break

            items = try_get(
                sections,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not items:
                break

            for item in items:
                if not isinstance(item, dict):
                    continue
                renderer = item.get('videoRenderer')
                if not isinstance(renderer, dict):
                    continue
                video_id = renderer.get('videoId')
                if not video_id:
                    continue

                title = try_get(
                    renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(
                    renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                length_text = try_get(
                    renderer, lambda x: x['lengthText']['simpleText'], compat_str)
                duration = parse_duration(length_text)
                view_count_text = try_get(
                    renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Strip all whitespace, then keep the leading run of digits.
                leading_digits = self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None)
                view_count = int_or_none(leading_digits)
                uploader = try_get(
                    renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)

                yielded += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if yielded == n:
                    return

            # The continuation token is read from the second section.
            token = try_get(
                sections,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            payload['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = self._entries(query, n)
        return self.playlist_result(entries, query)
75dff0ee 3385
c9ae7b95 3386
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that lists the newest videos first."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Sent as the "params" field of the search request.
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3392
c9ae7b95 3393
class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/results?search_query=...`` (or ``q=``) URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [
        {
            'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
            'playlist_mincount': 5,
            'info_dict': {
                'title': 'youtube-dl test video',
            },
        },
        {
            'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
            'only_matching': True,
        },
    ]

    def _process_json_dict(self, obj, videos, c):
        """Collect ``obj`` if it is a video, or record its continuation data.

        A dict with a "videoId" key is appended to ``videos``; otherwise a
        "nextContinuationData" value is stored under ``c["continuation"]``.
        """
        if "videoId" in obj:
            videos.append(obj)
        elif "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Fill the parallel lists ``ids_in_page``/``titles_in_page`` from ``page``.

        New ids are appended together with their title; for an id already
        listed, only a missing title is filled in.
        """
        initial_data = self._search_regex(self._SEARCH_DATA, page, 'ytInitialData')
        search_response = self._parse_json(initial_data, None)

        for renderer in self._find_videos_in_json(search_response):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                position = ids_in_page.index(video_id)
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return the (id, title) pairs found in ``page``."""
        video_ids, video_titles = [], []
        self.extract_videos_from_page_impl(page, video_ids, video_titles)
        return zip(video_ids, video_titles)

    def _real_extract(self, url):
        """Return a playlist titled after the URL's decoded search query."""
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(match.group('query'))
        webpage = self._download_webpage(url, query)
        # NOTE(review): n=5 is a hard-coded limit handed to _entries, which
        # is defined outside this class - confirm how it is applied there.
        entries = self._entries(webpage, query, n=5)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
3453
3454
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` pages."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [
        {
            'url': 'https://www.youtube.com/show/airdisasters',
            'playlist_mincount': 5,
            'info_dict': {
                'id': 'airdisasters',
                'title': 'Air Disasters',
            },
        },
    ]

    def _real_extract(self, url):
        """Run the parent extraction on the show's ``/playlists`` page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3472
3473
class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.

    _FEED_NAME is used both in the extractor name ("youtube:<name>") and in
    the feed URL; _PLAYLIST_TITLE is the title of the resulting playlist.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _process_entries(self, entries, seen):
        """Yield url results for the entries that were not seen before.

        ``entries`` is an iterable of renderer objects; ``seen`` is a list of
        the renderers already handled and is extended in place with the new
        ones. Entries without a string "videoId" are skipped. Nothing is
        yielded when no entry is new.
        """
        # Index the known ids once so that each membership test is O(1)
        # instead of a linear scan over ``seen`` per entry. try_get keeps
        # malformed items in ``seen`` from raising.
        known_ids = set()
        for old in seen:
            old_id = try_get(old, lambda x: x['videoId'], compat_str)
            if old_id:
                known_ids.add(old_id)

        new_info = []
        new_ids = []
        for v in entries:
            v_id = try_get(v, lambda x: x['videoId'], compat_str)
            if not v_id or v_id in known_ids:
                continue
            # known_ids is deliberately not updated here: only entries
            # seen in earlier calls are filtered out, not repeats within
            # the same batch.
            new_info.append(v)
            new_ids.append(v_id)

        if not new_info:
            return

        seen.extend(new_info)
        for v_id, video in zip(new_ids, new_info):
            yield self.url_result(v_id, YoutubeIE.ie_key(), video_title=self._extract_title(video))

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
                                    playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3517
3518
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "WL" (watch later) playlist."""

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/playlist?list=WL',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return the single video when one is selected, else the WL playlist."""
        _video_id, single_video = self._check_download_just_video(url, 'WL')
        if single_video:
            return single_video
        _has_videos, watch_later = self._extract_playlist('WL')
        return watch_later
f459d170 3538
5f6a1245 3539
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""

    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is the text after "list=" up to the next quote or ampersand.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
3550
3551
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/recommended``."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 3557
1ed5b5c9 3558
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/subscriptions``."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1ed5b5c9 3564
1ed5b5c9 3565
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for ``/feed/history``."""

    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
1ed5b5c9
JMF
3571
3572
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why."""

    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3620
3621
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose ``v=`` id is 1 to 10 characters long."""

    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_id'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)