]> jfr.im git - yt-dlp.git/blame - youtube_dlc/extractor/youtube.py
Merge branch 'master' into youtube-playlist-polymer
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
d77ab8e2 9import random
c5e8d7af 10import re
42939b61 11import time
e0df6211 12import traceback
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
f8c55c66 19 compat_HTTPError,
8d81f3e3 20 compat_kwargs,
c5e8d7af 21 compat_parse_qs,
7fd002c0
S
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
15707c7e 24 compat_urllib_parse_urlencode,
7c80519c 25 compat_urllib_parse_urlparse,
7c61bd36 26 compat_urlparse,
c5e8d7af 27 compat_str,
4bb4a188
PH
28)
29from ..utils import (
27019dbb 30 bool_or_none,
c5e8d7af 31 clean_html,
9b9c5355 32 error_to_compat_str,
351f37c0 33 extract_attributes,
c5e8d7af 34 ExtractorError,
2d30521a 35 float_or_none,
4bb4a188
PH
36 get_element_by_attribute,
37 get_element_by_id,
dd27fd17 38 int_or_none,
9833e7a0 39 js_to_json,
94278f72 40 mimetype2ext,
4bb4a188 41 orderedSet,
6310acf5 42 parse_codecs,
b84071c0 43 parse_count,
7c80519c 44 parse_duration,
0cb58b02 45 remove_quotes,
3995d37d 46 remove_start,
cf7e015f 47 smuggle_url,
dbdaaa23 48 str_or_none,
c93d53f5 49 str_to_int,
556dbe7f 50 try_get,
c5e8d7af
PH
51 unescapeHTML,
52 unified_strdate,
cf7e015f 53 unsmuggle_url,
81c2f20b 54 uppercase_escape,
21c340b8 55 url_or_none,
6e6bc8da 56 urlencode_postdata,
c5e8d7af
PH
57)
58
5f6a1245 59
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn an iterable of video ids into a list of 'Youtube' url_result entries."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Always report failure as False, as the docstring promises
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the login form plus f_req to url and return the decoded JSON reply.

            The download is non-fatal; the callers below treat a False
            result as a failed request.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' of the reply
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the
        # challenge request payloads.
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Always report failure as False, as the docstring promises
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # The conditional is parenthesised so that the 'Unable to login'
            # prefix is kept for every message, not only for the
            # INCORRECT_ANSWER_ENTERED case.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised for the same reason as the login message
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Forward to the parent implementation with a private copy of the 'query' kwarg."""
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Return the parsed ytInitialData object found in webpage.

        Nothing is returned (None) when neither assignment form is present
        in the page; JSON parsing is non-fatal.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 299
8377574c 300
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Shared paging logic for extractors that list entries from ytInitialData.

    Subclasses are expected to provide ``_is_entry(obj)`` and
    ``_process_entries(entries, seen)``; both are called here but not
    defined in this class.
    """

    def _find_entries_in_json(self, extracted):
        """Walk a decoded JSON tree and collect entries plus a continuation.

        Returns a tuple ``(entries, continuation)``: ``entries`` is the list
        of dicts accepted by ``self._is_entry``, ``continuation`` is the last
        dict seen that contains a 'continuationCommand' key, or None.
        Matching dicts are not descended into.
        """
        entries = []
        c = {}

        def _real_find(obj):
            if obj is None or isinstance(obj, str):
                return

            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if self._is_entry(obj):
                    entries.append(obj)
                    return

                if 'continuationCommand' in obj:
                    c['continuation'] = obj
                    return

                for o in obj.values():
                    _real_find(o)

        _real_find(extracted)

        return entries, c.get('continuation')

    def _entries(self, page, playlist_id, n=1):
        """Yield processed entries from page and up to n - 1 continuation pages.

        page is the HTML of the listing page, playlist_id is used for
        progress/error reporting, n is the maximum number of pages whose
        entries are processed.
        """
        seen = []

        # Merge every ytcfg.set({...}) object found in the page; the API key
        # and client context for continuation requests are read from it.
        yt_conf = {}
        for m in re.finditer(self._YTCFG_DATA_RE, page):
            parsed = self._parse_json(m.group(1), playlist_id,
                                      transform_source=js_to_json, fatal=False)
            if parsed:
                yt_conf.update(parsed)

        data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)

        # for page_num in itertools.count(1):
        for page_num in range(n):
            entries, continuation = self._find_entries_in_json(data_json)
            processed = self._process_entries(entries, seen)

            # NOTE(review): if _process_entries is a generator function this
            # check can never be true (generator objects are always truthy)
            # - confirm the intended behaviour for empty pages.
            if not processed:
                break
            for entry in processed:
                yield entry

            if page_num + 1 >= n:
                # This was the last page we are allowed to process; fetching
                # the continuation would be a wasted request.
                break

            if not continuation or not yt_conf:
                break
            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
            if not continuation_token or not continuation_url:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    data_json = self._download_json(
                        'https://www.youtube.com%s' % continuation_url,
                        playlist_id,
                        'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),

                        transform_source=uppercase_escape,
                        query={
                            'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
                        },
                        # str.encode works on both Python 2 and 3, whereas
                        # bytes(text, encoding=...) raises TypeError on Python 2
                        data=json.dumps({
                            'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
                            'continuation': continuation_token
                        }).encode('utf-8'),
                        headers={
                            'Content-Type': 'application/json'
                        }
                    )
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

    def _extract_title(self, renderer):
        """Return the title text of renderer, or None.

        The first 'runs' text is preferred, 'simpleText' is the fallback.
        """
        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        if title:
            return title
        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
648e6a1f 396
061a75ed
S
397
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are individual videos."""

    def _is_entry(self, obj):
        """A dict counts as an entry when it has a 'videoId' key."""
        return 'videoId' in obj

    def _process_entries(self, entries, seen):
        """Yield one 'Youtube' url_result per distinct video id, in first-seen order.

        Renderers lacking an id or a title are skipped. When an id occurs
        more than once, a later non-empty title fills in an earlier empty one.
        """
        ordered_ids = []
        ordered_titles = []
        for item in entries:
            vid = try_get(item, lambda x: x['videoId'])
            raw_title = self._extract_title(item)

            # we do not have a videoRenderer or title extraction broke
            if vid is None or raw_title is None:
                continue

            stripped_title = raw_title.strip()

            if vid in ordered_ids:
                pos = ordered_ids.index(vid)
                if stripped_title and not ordered_titles[pos]:
                    ordered_titles[pos] = stripped_title
            else:
                ordered_ids.append(vid)
                ordered_titles.append(stripped_title)

        for vid, title in zip(ordered_ids, ordered_titles):
            yield self.url_result(vid, 'Youtube', vid, title)
648e6a1f
S
425
426
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are playlists."""

    def _is_entry(self, obj):
        """A dict counts as an entry when it has a 'playlistId' key."""
        return 'playlistId' in obj

    def _process_entries(self, entries, seen):
        """Yield one 'YoutubePlaylist' url_result per distinct playlist id."""
        unique_ids = orderedSet(
            try_get(item, lambda x: x['playlistId']) for item in entries)
        for pl_id in unique_ids:
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % pl_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the page at url and return its playlists as a playlist result."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # The title is optional: a missing og:title must not abort extraction
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
0c148415
S
442
443
360e1ca5 444class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 445 IE_DESC = 'YouTube.com'
cb7dfeea 446 _VALID_URL = r"""(?x)^
c5e8d7af 447 (
edb53e2d 448 (?:https?://|//) # http(s):// or protocol-independent URL
66b48727 449 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
484aaeb2 450 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 451 (?:www\.)?pwnyoutube\.com/|
8b561bfc 452 (?:www\.)?hooktube\.com/|
f7000f3a 453 (?:www\.)?yourepeat\.com/|
e69ae5b9 454 tube\.majestyc\.net/|
ba036333 455 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
77d95677 456 (?:(?:www|dev)\.)?invidio\.us/|
ba036333 457 (?:(?:www|no)\.)?invidiou\.sh/|
458 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
8ae113ca 459 (?:www\.)?invidious\.kabi\.tk/|
ba036333 460 (?:www\.)?invidious\.13ad\.de/|
791d2e81 461 (?:www\.)?invidious\.mastodon\.host/|
494d664e 462 (?:www\.)?invidious\.nixnet\.xyz/|
666d808e 463 (?:www\.)?invidious\.drycat\.fr/|
ba036333 464 (?:www\.)?tube\.poal\.co/|
8ae113ca 465 (?:www\.)?vid\.wxzm\.sx/|
384bf91f 466 (?:www\.)?yewtu\.be/|
494d664e 467 (?:www\.)?yt\.elukerio\.org/|
894b3826 468 (?:www\.)?yt\.lelux\.fi/|
1db5ab6b 469 (?:www\.)?invidious\.ggc-project\.de/|
470 (?:www\.)?yt\.maisputain\.ovh/|
471 (?:www\.)?invidious\.13ad\.de/|
472 (?:www\.)?invidious\.toot\.koeln/|
473 (?:www\.)?invidious\.fdn\.fr/|
474 (?:www\.)?watch\.nettohikari\.com/|
bff90fc5 475 (?:www\.)?kgg2m7yk5aybusll\.onion/|
476 (?:www\.)?qklhadlycap4cnod\.onion/|
477 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
478 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
479 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
480 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
33c1c7d8 481 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
1db5ab6b 482 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
e69ae5b9 483 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
484 (?:.*?\#/)? # handle anchor (#/) redirect urls
485 (?: # the various things that can precede the ID:
ac7553d0 486 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 487 |(?: # or the v= param in all its forms
f7000f3a 488 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 489 (?:\?|\#!?) # the params delimiter ? or # or #!
040ac686 490 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
c5e8d7af
PH
491 v=
492 )
f4b05232 493 ))
cbaed4bb
S
494 |(?:
495 youtu\.be| # just youtu.be/xxxx
6d4fc66b
S
496 vid\.plus| # or vid.plus/xxxx
497 zwearz\.com/watch| # or zwearz.com/watch/xxxx
cbaed4bb 498 )/
edb53e2d 499 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 500 )
c5e8d7af 501 )? # all until now is optional -> you can pass the naked ID
8963d9c2 502 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
d0ba5587
S
503 (?!.*?\blist=
504 (?:
505 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
506 WL # WL are handled by the watch later IE
507 )
508 )
c5e8d7af 509 (?(1).+)? # if we found the ID, everything can follow
d0ba5587 510 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
c5e8d7af 511 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
e40c758c
S
512 _PLAYER_INFO_RE = (
513 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
514 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
515 )
2c62dc26 516 _formats = {
c2d3cb4c 517 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
518 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
519 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
520 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
521 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
522 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
523 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
524 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
3834d3e3 525 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
c2d3cb4c 526 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
527 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
528 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
529 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
530 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
531 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
e1a0bfdf 532 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
c2d3cb4c 533 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
534 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
e1a0bfdf 535
536
537 # 3D videos
c2d3cb4c 538 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
539 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
540 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
541 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
e1a0bfdf 542 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
543 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
544 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
836a086c 545
96fb5605 546 # Apple HTTP Live Streaming
11f12195 547 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
c2d3cb4c 548 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
549 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
550 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
551 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
552 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
e1a0bfdf 553 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
554 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
2c62dc26
PH
555
556 # DASH mp4 video
d23028a8
S
557 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
558 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
559 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
560 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
561 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
067aa17e 562 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
d23028a8
S
563 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
564 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
565 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
566 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
567 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
568 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
836a086c 569
f6f1fc92 570 # Dash mp4 audio
d23028a8
S
571 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
572 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
573 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
574 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
575 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
576 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
577 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
836a086c
AZ
578
579 # Dash webm
d23028a8
S
580 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
581 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
582 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
583 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
584 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
585 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
586 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
587 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
588 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
589 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
590 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
591 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
592 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
593 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
594 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
4c6b4764 595 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
d23028a8
S
596 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
597 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
598 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
599 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
600 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
601 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
2c62dc26
PH
602
603 # Dash webm audio
d23028a8
S
604 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
605 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
ce6b9a2d 606
0857baad 607 # Dash webm audio with opus inside
d23028a8
S
608 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
609 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
610 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
0857baad 611
ce6b9a2d
PH
612 # RTMP (unnamed)
613 '_rtmp': {'protocol': 'rtmp'},
b85eae0f
S
614
615 # av01 video only formats sometimes served with "unknown" codecs
616 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
617 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
618 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
619 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
c5e8d7af 620 }
84da5d84 621 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
836a086c 622
fd5c4aab
S
623 _GEO_BYPASS = False
624
78caa52a 625 IE_NAME = 'youtube'
2eb88d95
PH
626 _TESTS = [
627 {
2d3d2997 628 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
4bc3a23e
PH
629 'info_dict': {
630 'id': 'BaW_jenozKc',
631 'ext': 'mp4',
3867038a 632 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
4bc3a23e
PH
633 'uploader': 'Philipp Hagemeister',
634 'uploader_id': 'phihag',
ec85ded8 635 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
dd4c4492
S
636 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
637 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
4bc3a23e 638 'upload_date': '20121002',
3867038a 639 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
4bc3a23e 640 'categories': ['Science & Technology'],
3867038a 641 'tags': ['youtube-dl'],
556dbe7f 642 'duration': 10,
dbdaaa23 643 'view_count': int,
3e7c1224
PH
644 'like_count': int,
645 'dislike_count': int,
7c80519c 646 'start_time': 1,
297a564b 647 'end_time': 9,
2eb88d95 648 }
0e853ca4 649 },
fccd3771 650 {
4bc3a23e
PH
651 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
652 'note': 'Embed-only video (#1746)',
653 'info_dict': {
654 'id': 'yZIXLfi8CZQ',
655 'ext': 'mp4',
656 'upload_date': '20120608',
657 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
658 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
659 'uploader': 'SET India',
94bfcd23 660 'uploader_id': 'setindia',
ec85ded8 661 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
94bfcd23 662 'age_limit': 18,
fccd3771
PH
663 }
664 },
11b56058 665 {
2d3d2997 666 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
11b56058
PM
667 'note': 'Use the first video ID in the URL',
668 'info_dict': {
669 'id': 'BaW_jenozKc',
670 'ext': 'mp4',
3867038a 671 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
11b56058
PM
672 'uploader': 'Philipp Hagemeister',
673 'uploader_id': 'phihag',
ec85ded8 674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
11b56058 675 'upload_date': '20121002',
3867038a 676 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
11b56058 677 'categories': ['Science & Technology'],
3867038a 678 'tags': ['youtube-dl'],
556dbe7f 679 'duration': 10,
dbdaaa23 680 'view_count': int,
11b56058
PM
681 'like_count': int,
682 'dislike_count': int,
34a7de29
S
683 },
684 'params': {
685 'skip_download': True,
686 },
11b56058 687 },
dd27fd17 688 {
2d3d2997 689 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
4bc3a23e
PH
690 'note': '256k DASH audio (format 141) via DASH manifest',
691 'info_dict': {
692 'id': 'a9LDPn-MO4I',
693 'ext': 'm4a',
694 'upload_date': '20121002',
695 'uploader_id': '8KVIDEO',
ec85ded8 696 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
4bc3a23e
PH
697 'description': '',
698 'uploader': '8KVIDEO',
699 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 700 },
4bc3a23e
PH
701 'params': {
702 'youtube_include_dash_manifest': True,
703 'format': '141',
4919603f 704 },
de3c7fe0 705 'skip': 'format 141 not served anymore',
dd27fd17 706 },
aa79ac0c
PH
707 # Controversy video
708 {
709 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
710 'info_dict': {
711 'id': 'T4XJQO3qol8',
712 'ext': 'mp4',
556dbe7f 713 'duration': 219,
aa79ac0c 714 'upload_date': '20100909',
4fe54c12 715 'uploader': 'Amazing Atheist',
aa79ac0c 716 'uploader_id': 'TheAmazingAtheist',
ec85ded8 717 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
aa79ac0c
PH
718 'title': 'Burning Everyone\'s Koran',
719 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
720 }
c522adb1 721 },
dd2d55f1 722 # Normal age-gate video (embed allowed)
c522adb1 723 {
2d3d2997 724 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
c522adb1
JMF
725 'info_dict': {
726 'id': 'HtVdAasjOgU',
727 'ext': 'mp4',
728 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
ec85ded8 729 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
556dbe7f 730 'duration': 142,
c522adb1
JMF
731 'uploader': 'The Witcher',
732 'uploader_id': 'WitcherGame',
ec85ded8 733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
c522adb1 734 'upload_date': '20140605',
34952f09 735 'age_limit': 18,
c522adb1
JMF
736 },
737 },
067aa17e 738 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
e52a40ab
PH
739 {
740 'url': 'lqQg6PlCWgI',
741 'info_dict': {
742 'id': 'lqQg6PlCWgI',
743 'ext': 'mp4',
556dbe7f 744 'duration': 6085,
90227264 745 'upload_date': '20150827',
cbe2bd91 746 'uploader_id': 'olympic',
ec85ded8 747 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
cbe2bd91 748 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
be49068d 749 'uploader': 'Olympic',
cbe2bd91
PH
750 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
751 },
752 'params': {
753 'skip_download': 'requires avconv',
e52a40ab 754 }
cbe2bd91 755 },
6271f1ca
PH
756 # Non-square pixels
757 {
758 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
759 'info_dict': {
760 'id': '_b-2C3KPAM0',
761 'ext': 'mp4',
762 'stretched_ratio': 16 / 9.,
556dbe7f 763 'duration': 85,
6271f1ca
PH
764 'upload_date': '20110310',
765 'uploader_id': 'AllenMeow',
ec85ded8 766 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
6271f1ca 767 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
eb6793ba 768 'uploader': '孫ᄋᄅ',
6271f1ca
PH
769 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
770 },
06b491eb
S
771 },
772 # url_encoded_fmt_stream_map is empty string
773 {
774 'url': 'qEJwOuvDf7I',
775 'info_dict': {
776 'id': 'qEJwOuvDf7I',
f57b7835 777 'ext': 'webm',
06b491eb
S
778 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
779 'description': '',
780 'upload_date': '20150404',
781 'uploader_id': 'spbelect',
782 'uploader': 'Наблюдатели Петербурга',
783 },
784 'params': {
785 'skip_download': 'requires avconv',
e323cf3f
S
786 },
787 'skip': 'This live event has ended.',
06b491eb 788 },
067aa17e 789 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
da77d856
S
790 {
791 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
792 'info_dict': {
793 'id': 'FIl7x6_3R5Y',
eb6793ba 794 'ext': 'webm',
da77d856
S
795 'title': 'md5:7b81415841e02ecd4313668cde88737a',
796 'description': 'md5:116377fd2963b81ec4ce64b542173306',
556dbe7f 797 'duration': 220,
da77d856
S
798 'upload_date': '20150625',
799 'uploader_id': 'dorappi2000',
ec85ded8 800 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
da77d856 801 'uploader': 'dorappi2000',
eb6793ba 802 'formats': 'mincount:31',
da77d856 803 },
eb6793ba 804 'skip': 'not actual anymore',
2ee8f5d8 805 },
8a1a26ce
YCH
806 # DASH manifest with segment_list
807 {
808 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
809 'md5': '8ce563a1d667b599d21064e982ab9e31',
810 'info_dict': {
811 'id': 'CsmdDsKjzN8',
812 'ext': 'mp4',
17ee98e1 813 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
814 'uploader': 'Airtek',
815 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
816 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
817 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
818 },
819 'params': {
820 'youtube_include_dash_manifest': True,
821 'format': '135', # bestvideo
be49068d
S
822 },
823 'skip': 'This live event has ended.',
2ee8f5d8 824 },
cf7e015f
S
825 {
826 # Multifeed videos (multiple cameras), URL is for Main Camera
827 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
828 'info_dict': {
829 'id': 'jqWvoWXjCVs',
830 'title': 'teamPGP: Rocket League Noob Stream',
831 'description': 'md5:dc7872fb300e143831327f1bae3af010',
832 },
833 'playlist': [{
834 'info_dict': {
835 'id': 'jqWvoWXjCVs',
836 'ext': 'mp4',
837 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
838 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 839 'duration': 7335,
cf7e015f
S
840 'upload_date': '20150721',
841 'uploader': 'Beer Games Beer',
842 'uploader_id': 'beergamesbeer',
ec85ded8 843 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 844 'license': 'Standard YouTube License',
cf7e015f
S
845 },
846 }, {
847 'info_dict': {
848 'id': '6h8e8xoXJzg',
849 'ext': 'mp4',
850 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
851 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 852 'duration': 7337,
cf7e015f
S
853 'upload_date': '20150721',
854 'uploader': 'Beer Games Beer',
855 'uploader_id': 'beergamesbeer',
ec85ded8 856 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 857 'license': 'Standard YouTube License',
cf7e015f
S
858 },
859 }, {
860 'info_dict': {
861 'id': 'PUOgX5z9xZw',
862 'ext': 'mp4',
863 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
864 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 865 'duration': 7337,
cf7e015f
S
866 'upload_date': '20150721',
867 'uploader': 'Beer Games Beer',
868 'uploader_id': 'beergamesbeer',
ec85ded8 869 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 870 'license': 'Standard YouTube License',
cf7e015f
S
871 },
872 }, {
873 'info_dict': {
874 'id': 'teuwxikvS5k',
875 'ext': 'mp4',
876 'title': 'teamPGP: Rocket League Noob Stream (zim)',
877 'description': 'md5:dc7872fb300e143831327f1bae3af010',
556dbe7f 878 'duration': 7334,
cf7e015f
S
879 'upload_date': '20150721',
880 'uploader': 'Beer Games Beer',
881 'uploader_id': 'beergamesbeer',
ec85ded8 882 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
7caf9830 883 'license': 'Standard YouTube License',
cf7e015f
S
884 },
885 }],
886 'params': {
887 'skip_download': True,
888 },
4fe54c12 889 'skip': 'This video is not available.',
cbaed4bb 890 },
f9f49d87 891 {
067aa17e 892 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
f9f49d87
S
893 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
894 'info_dict': {
895 'id': 'gVfLd0zydlo',
896 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
897 },
898 'playlist_count': 2,
be49068d 899 'skip': 'Not multifeed anymore',
f9f49d87 900 },
cbaed4bb 901 {
2d3d2997 902 'url': 'https://vid.plus/FlRa-iH7PGw',
cbaed4bb 903 'only_matching': True,
0e49d9a6 904 },
6d4fc66b 905 {
2d3d2997 906 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
6d4fc66b
S
907 'only_matching': True,
908 },
0e49d9a6 909 {
067aa17e 910 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
a8776b10 911 # Also tests cut-off URL expansion in video description (see
067aa17e
S
912 # https://github.com/ytdl-org/youtube-dl/issues/1892,
913 # https://github.com/ytdl-org/youtube-dl/issues/8164)
0e49d9a6
LL
914 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
915 'info_dict': {
916 'id': 'lsguqyKfVQg',
917 'ext': 'mp4',
918 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
eb6793ba 919 'alt_title': 'Dark Walk - Position Music',
0e49d9a6 920 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
556dbe7f 921 'duration': 133,
0e49d9a6
LL
922 'upload_date': '20151119',
923 'uploader_id': 'IronSoulElf',
ec85ded8 924 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
0e49d9a6 925 'uploader': 'IronSoulElf',
eb6793ba
S
926 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
927 'track': 'Dark Walk - Position Music',
928 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
92bc97d3 929 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
0e49d9a6
LL
930 },
931 'params': {
932 'skip_download': True,
933 },
934 },
61f92af1 935 {
067aa17e 936 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
61f92af1
S
937 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
938 'only_matching': True,
939 },
313dfc45
LL
940 {
941 # Video with yt:stretch=17:0
942 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
943 'info_dict': {
944 'id': 'Q39EVAstoRM',
945 'ext': 'mp4',
946 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
947 'description': 'md5:ee18a25c350637c8faff806845bddee9',
948 'upload_date': '20151107',
949 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
950 'uploader': 'CH GAMER DROID',
951 },
952 'params': {
953 'skip_download': True,
954 },
be49068d 955 'skip': 'This video does not exist.',
313dfc45 956 },
7caf9830
S
957 {
958 # Video licensed under Creative Commons
959 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
960 'info_dict': {
961 'id': 'M4gD1WSo5mA',
962 'ext': 'mp4',
963 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
964 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
556dbe7f 965 'duration': 721,
7caf9830
S
966 'upload_date': '20150127',
967 'uploader_id': 'BerkmanCenter',
ec85ded8 968 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
556dbe7f 969 'uploader': 'The Berkman Klein Center for Internet & Society',
7caf9830
S
970 'license': 'Creative Commons Attribution license (reuse allowed)',
971 },
972 'params': {
973 'skip_download': True,
974 },
975 },
fd050249
S
976 {
977 # Channel-like uploader_url
978 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
979 'info_dict': {
980 'id': 'eQcmzGIKrzg',
981 'ext': 'mp4',
982 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
983 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
556dbe7f 984 'duration': 4060,
fd050249 985 'upload_date': '20151119',
eb6793ba 986 'uploader': 'Bernie Sanders',
fd050249 987 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
ec85ded8 988 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
fd050249
S
989 'license': 'Creative Commons Attribution license (reuse allowed)',
990 },
991 'params': {
992 'skip_download': True,
993 },
994 },
040ac686
S
995 {
996 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
997 'only_matching': True,
7f29cf54
S
998 },
999 {
067aa17e 1000 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
7f29cf54
S
1001 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1002 'only_matching': True,
6496ccb4
S
1003 },
1004 {
1005 # Rental video preview
1006 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1007 'info_dict': {
1008 'id': 'uGpuVWrhIzE',
1009 'ext': 'mp4',
1010 'title': 'Piku - Trailer',
1011 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1012 'upload_date': '20150811',
1013 'uploader': 'FlixMatrix',
1014 'uploader_id': 'FlixMatrixKaravan',
ec85ded8 1015 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
6496ccb4
S
1016 'license': 'Standard YouTube License',
1017 },
1018 'params': {
1019 'skip_download': True,
1020 },
eb6793ba 1021 'skip': 'This video is not available.',
022a5d66 1022 },
12afdc2a
S
1023 {
1024 # YouTube Red video with episode data
1025 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1026 'info_dict': {
1027 'id': 'iqKdEhx-dD4',
1028 'ext': 'mp4',
1029 'title': 'Isolation - Mind Field (Ep 1)',
4fe54c12 1030 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
556dbe7f 1031 'duration': 2085,
12afdc2a
S
1032 'upload_date': '20170118',
1033 'uploader': 'Vsauce',
1034 'uploader_id': 'Vsauce',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
12afdc2a
S
1036 'series': 'Mind Field',
1037 'season_number': 1,
1038 'episode_number': 1,
1039 },
1040 'params': {
1041 'skip_download': True,
1042 },
1043 'expected_warnings': [
1044 'Skipping DASH manifest',
1045 ],
1046 },
c7121fa7
S
1047 {
1048 # The following content has been identified by the YouTube community
1049 # as inappropriate or offensive to some audiences.
1050 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1051 'info_dict': {
1052 'id': '6SJNVb0GnPI',
1053 'ext': 'mp4',
1054 'title': 'Race Differences in Intelligence',
1055 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1056 'duration': 965,
1057 'upload_date': '20140124',
1058 'uploader': 'New Century Foundation',
1059 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1060 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
c7121fa7
S
1061 },
1062 'params': {
1063 'skip_download': True,
1064 },
1065 },
022a5d66
S
1066 {
1067 # itag 212
1068 'url': '1t24XAntNCY',
1069 'only_matching': True,
fd5c4aab
S
1070 },
1071 {
1072 # geo restricted to JP
1073 'url': 'sJL6WA-aGkQ',
1074 'only_matching': True,
1075 },
d0ba5587
S
1076 {
1077 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1078 'only_matching': True,
1079 },
cd5a74a2
S
1080 {
1081 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1082 'only_matching': True,
1083 },
825cd268
RA
1084 {
1085 # DRM protected
1086 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1087 'only_matching': True,
4fe54c12
S
1088 },
1089 {
1090 # Video with unsupported adaptive stream type formats
1091 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1092 'info_dict': {
1093 'id': 'Z4Vy8R84T1U',
1094 'ext': 'mp4',
1095 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1096 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1097 'duration': 433,
1098 'upload_date': '20130923',
1099 'uploader': 'Amelia Putri Harwita',
1100 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1101 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1102 'formats': 'maxcount:10',
1103 },
1104 'params': {
1105 'skip_download': True,
1106 'youtube_include_dash_manifest': False,
1107 },
5429d6a9 1108 'skip': 'not actual anymore',
5caabd3c 1109 },
1110 {
822b9d9c 1111 # Youtube Music Auto-generated description
5caabd3c 1112 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1113 'info_dict': {
1114 'id': 'MgNrAu2pzNs',
1115 'ext': 'mp4',
1116 'title': 'Voyeur Girl',
1117 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1118 'upload_date': '20190312',
5429d6a9
S
1119 'uploader': 'Stephen - Topic',
1120 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
5caabd3c 1121 'artist': 'Stephen',
1122 'track': 'Voyeur Girl',
1123 'album': 'it\'s too much love to know my dear',
1124 'release_date': '20190313',
1125 'release_year': 2019,
1126 },
1127 'params': {
1128 'skip_download': True,
1129 },
1130 },
1131 {
822b9d9c 1132 # Youtube Music Auto-generated description
5caabd3c 1133 # Retrieve 'artist' field from 'Artist:' in video description
1134 # when it is present on youtube music video
5caabd3c 1135 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1136 'info_dict': {
1137 'id': 'k0jLE7tTwjY',
1138 'ext': 'mp4',
1139 'title': 'Latch Feat. Sam Smith',
1140 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1141 'upload_date': '20150110',
1142 'uploader': 'Various Artists - Topic',
1143 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1144 'artist': 'Disclosure',
1145 'track': 'Latch Feat. Sam Smith',
1146 'album': 'Latch Featuring Sam Smith',
1147 'release_date': '20121008',
1148 'release_year': 2012,
1149 },
1150 'params': {
1151 'skip_download': True,
1152 },
1153 },
1154 {
822b9d9c 1155 # Youtube Music Auto-generated description
5caabd3c 1156 # handle multiple artists on youtube music video
1157 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1158 'info_dict': {
1159 'id': '74qn0eJSjpA',
1160 'ext': 'mp4',
1161 'title': 'Eastside',
1162 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1163 'upload_date': '20180710',
1164 'uploader': 'Benny Blanco - Topic',
1165 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1166 'artist': 'benny blanco, Halsey, Khalid',
1167 'track': 'Eastside',
1168 'album': 'Eastside',
1169 'release_date': '20180713',
1170 'release_year': 2018,
1171 },
1172 'params': {
1173 'skip_download': True,
1174 },
1175 },
1176 {
822b9d9c 1177 # Youtube Music Auto-generated description
5caabd3c 1178 # handle youtube music video with release_year and no release_date
1179 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1180 'info_dict': {
1181 'id': '-hcAI0g-f5M',
1182 'ext': 'mp4',
1183 'title': 'Put It On Me',
5429d6a9 1184 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
5caabd3c 1185 'upload_date': '20180426',
1186 'uploader': 'Matt Maeson - Topic',
1187 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1188 'artist': 'Matt Maeson',
1189 'track': 'Put It On Me',
1190 'album': 'The Hearse',
1191 'release_date': None,
1192 'release_year': 2018,
1193 },
1194 'params': {
1195 'skip_download': True,
1196 },
1197 },
66b48727
RA
1198 {
1199 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1200 'only_matching': True,
1201 },
011e75e6
S
1202 {
1203 # invalid -> valid video id redirection
1204 'url': 'DJztXj2GPfl',
1205 'info_dict': {
1206 'id': 'DJztXj2GPfk',
1207 'ext': 'mp4',
1208 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1209 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1210 'upload_date': '20090125',
1211 'uploader': 'Prochorowka',
1212 'uploader_id': 'Prochorowka',
1213 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1214 'artist': 'Panjabi MC',
1215 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1216 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1217 },
1218 'params': {
1219 'skip_download': True,
1220 },
ea74e00b
DP
1221 },
1222 {
1223 # empty description results in an empty string
1224 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1225 'info_dict': {
1226 'id': 'x41yOUIvK2k',
1227 'ext': 'mp4',
1228 'title': 'IMG 3456',
1229 'description': '',
1230 'upload_date': '20170613',
1231 'uploader_id': 'ElevageOrVert',
1232 'uploader': 'ElevageOrVert',
1233 },
1234 'params': {
1235 'skip_download': True,
1236 },
1237 },
2eb88d95
PH
1238 ]
1239
def __init__(self, *args, **kwargs):
    """Set up the extractor and start with an empty player cache.

    Every positional and keyword argument is handed unchanged to the
    parent class initialiser.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Deciphering functions already obtained by this instance; populated
    # and looked up in _decrypt_signature.
    self._player_cache = {}
e0df6211 1243
def report_video_info_webpage_download(self, video_id):
    """Print a status line announcing the video info webpage download."""
    status = '%s: Downloading video info webpage' % video_id
    self.to_screen(status)
c5e8d7af 1247
def report_information_extraction(self, video_id):
    """Print a status line announcing that video information is being extracted."""
    status = '%s: Extracting video information' % video_id
    self.to_screen(status)
c5e8d7af
PH
1251
def report_unavailable_format(self, video_id, format):
    """Print a status line saying that `format` is not available for `video_id`."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
1255
def report_rtmp_download(self):
    """Print a status line saying that an RTMP download was detected."""
    status = 'RTMP download detected'
    self.to_screen(status)
c5e8d7af 1259
def _signature_cache_id(self, example_sig):
    """Describe the shape of a signature as dot-joined segment lengths.

    The signature is split on '.', and the length of every piece is
    rendered as text and re-joined with '.', so 'abcd.ef' gives '4.2'.
    """
    segment_lengths = [
        compat_str(len(segment)) for segment in example_sig.split('.')]
    return '.'.join(segment_lengths)
60064c53 1263
@classmethod
def _extract_player_info(cls, player_url):
    """Identify a player from its URL.

    The patterns in cls._PLAYER_INFO_RE are tried in order against
    `player_url`; the first one that matches supplies the result.

    Returns a tuple of the 'ext' and 'id' named groups of that match.
    Raises ExtractorError when no pattern matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match:
            return match.group('ext'), match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
1273
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like `example_sig`.

    The downloader cache (section 'youtube-sigfuncs') is consulted first.
    A cached entry is a list of indices, and the returned callable simply
    picks those positions out of its argument. On a cache miss the player
    is downloaded, handed to _parse_sig_js or _parse_sig_swf according to
    its type, and the index list derived from the parsed function is
    stored in the cache before that function is returned.
    """
    player_type, player_id = self._extract_player_info(player_url)

    # Read from filesystem cache
    sig_shape = self._signature_cache_id(example_sig)
    func_id = '%s_%s_%s' % (player_type, player_id, sig_shape)
    # The identifier must not carry any directory component.
    assert os.path.basename(func_id) == func_id

    cached_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cached_spec is not None:
        return lambda s: ''.join(s[i] for i in cached_spec)

    # The full URL is only shown in verbose mode.
    if self._downloader.params.get('verbose'):
        download_note = 'Downloading player %s' % player_url
    else:
        download_note = 'Downloading %s player %s' % (player_type, player_id)
    failure_note = 'Download of %s failed' % player_url

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id, note=download_note, errnote=failure_note)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id, note=download_note, errnote=failure_note)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    # Run the function on a string whose i-th character has code point i;
    # the code points of the output are then the source index of every
    # output position, which is what gets cached.
    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(ch) for ch in res(probe)]

    self._downloader.cache.store('youtube-sigfuncs', func_id, index_spec)
    return res
1313
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the deciphering function `func`.

    `func` is run on a probe string whose i-th character has code point
    i, so the output reveals which input index ends up at each position.
    Consecutive indices that move by +1 or -1 are folded into a slice
    expression, other indices are printed as single subscripts, and the
    pieces are joined with ' + '. The snippet is guarded by a test on
    the dot-separated segment lengths of `example_sig` and is written
    out with self.to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render s[first:last+stride:stride], leaving out default parts.
        lower = '' if first == 0 else str(first)
        upper = (':%d' % (last + stride)) if last + stride >= 0 else ':'
        stride_part = '' if stride == 1 else (':%d' % stride)
        return 's[%s%s%s]' % (lower, upper, stride_part)

    def _index_exprs(idxs):
        stride = None
        # Placeholder value only; it is replaced whenever a run begins.
        run_start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if stride is not None:
                if delta != stride:
                    # The run stopped at prev; emit it and start afresh.
                    yield _slice_expr(run_start, prev, stride)
                    stride = None
            elif delta in [-1, 1]:
                stride = delta
                run_start = prev
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending for the final index.
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_index_exprs(index_spec))
    segment_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % segment_lengths
    # NOTE(review): the whitespace before 'return' is kept exactly as found
    # here; it is the indentation of the printed snippet -- confirm that it
    # has not been collapsed in this copy of the file.
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 1352
def _parse_sig_js(self, jscode):
    """Build a deciphering callable from JavaScript player source.

    The name of the signature function is located with the patterns
    below (tried in the order listed, capturing the 'sig' group), the
    function is extracted with JSInterpreter, and a one-argument wrapper
    around it is returned.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # Same text as the entry directly above; kept so the list is unchanged.
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    decipher = interpreter.extract_function(funcname)
    # The extracted function takes a list of arguments.
    return lambda s: decipher([s])
1373
def _parse_sig_swf(self, file_contents):
    """Build a deciphering callable from the contents of an SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is
    extracted with SWFInterpreter and wrapped so that it can be called
    with the signature as its only argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')
    # The extracted function takes a list of arguments.
    return lambda s: decipher([s])
1380
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    `player_url` is first made absolute: a leading '//' gets 'https:'
    prepended, and anything else that lacks an http(s) scheme is joined
    onto 'https://www.youtube.com'. The deciphering function is looked
    up in (or added to) self._player_cache under the pair of that URL
    and the signature's shape, optionally printed when the
    'youtube_print_sig_code' parameter is set, and applied to `s`.

    `age_gate` is accepted but not used by this method.

    Raises ExtractorError when `player_url` is None, and also wraps any
    exception raised while obtaining or applying the function (the
    formatted traceback is appended to the message).
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        trace_text = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + trace_text, cause=e)
e0df6211 1407
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Collect the subtitle tracks listed for a video.

    The track list is fetched as XML from video.google.com/timedtext.
    For every language (only the first track seen per language code is
    used) one entry per format in self._SUBTITLE_FORMATS is produced,
    each a dict with 'url' and 'ext'. When `has_live_chat_replay` is
    true a 'live_chat' entry is added as well.

    `webpage` is not used by this method.

    Returns a dict mapping language code to that list of entries. An
    empty dict is returned, after a warning, when the list cannot be
    downloaded or nothing was collected.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in sub_lang_list:
            continue
        sub_lang_list[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        sub_lang_list['live_chat'] = [live_chat_entry]

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
1447
def _get_ytplayer_config(self, video_id, webpage):
    """Find and decode the player configuration JSON embedded in `webpage`.

    Returns whatever self._parse_json yields for the captured text (it
    is called with fatal=False), or None when no pattern matches.
    """
    # Text supplied by users can contain arbitrary character sequences
    # that confuse regex-based JSON extraction; for instance, when '};'
    # occurs the second pattern does not capture the whole JSON. As a
    # workaround the more specific pattern is tried first, until proper
    # handling of quoted strings replaces this (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
        r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
0e49d9a6 1466
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from a decoded initial-data structure.

    The rows of every 'videoSecondaryInfoRenderer' entry found under
    contents/twoColumnWatchNextResults/results/results/contents are
    scanned. Rows titled 'Album', 'Artist' or 'Song' are stored under
    the keys 'album', 'artist' and 'track' respectively. When a key
    shows up again within the same entry, a new track is started.

    Returns a list of dicts, one per track, each holding a subset of
    those three keys. The list is empty when nothing usable is found.
    """
    music_metadata = []
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata

    for content in contents:
        if not isinstance(content, dict):
            continue
        secondary_info = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(secondary_info, dict):
            continue
        rows = try_get(
            secondary_info,
            lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue

        music_track = {}
        for row in rows:
            row_renderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(row_renderer, dict):
                continue
            key = try_get(row_renderer, lambda x: x['title']['simpleText'])
            value = (
                try_get(row_renderer, lambda x: x['contents'][0]['simpleText'])
                or try_get(row_renderer, lambda x: x['contents'][0]['runs'][0]['text']))
            # compat_str rather than str: on Python 2 the text decoded
            # from JSON is unicode, which an exact `str` type test would
            # reject for every single row.
            if not isinstance(key, compat_str) or not isinstance(value, compat_str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
1504
def _get_automatic_captions(self, video_id, webpage):
    """Collect the automatic caption tracks advertised by the player config.

    The already-downloaded webpage is passed in so the player config can be
    read from it without fetching the page again.

    Returns a dict mapping a language code to a list of
    ``{'url': ..., 'ext': ...}`` dicts, one per entry of
    ``self._SUBTITLE_FORMATS``.  When no automatic captions can be found a
    warning is reported and an empty dict is returned.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(sub_url, sub_langs):
        # Re-use the query string of the track URL and only override the
        # target language and the output format for every combination.
        parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
        caption_qs = compat_parse_qs(parsed_sub_url.query)
        captions = {}
        for sub_lang in sub_langs:
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                caption_qs.update({
                    'tlang': [sub_lang],
                    'fmt': [ext],
                })
                track_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                    query=compat_urllib_parse_urlencode(caption_qs, True)))
                sub_formats.append({
                    'url': track_url,
                    'ext': ext,
                })
            captions[sub_lang] = sub_formats
        return captions

    try:
        if 'args' in player_config and 'ttsurl' in player_config['args']:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']

            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        # New captions format as of 22.06.2017
        if 'args' in player_config:
            player_response = player_config['args'].get('player_response')
        else:
            # New player system (ytInitialPlayerResponse) as of October 2020
            player_response = player_config

        if player_response:
            if isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
            # The parse above is non-fatal, so its result is not guaranteed
            # to be a dict; subscripting anything else would raise a
            # TypeError that the handler below does not cover.
            if not isinstance(player_response, dict):
                self._downloader.report_warning(err_msg)
                return {}

            renderer = player_response['captions']['playerCaptionsTracklistRenderer']
            caption_tracks = renderer['captionTracks']
            for caption_track in caption_tracks:
                if 'kind' not in caption_track:
                    # not an automatic transcription
                    continue
                base_url = caption_track['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                # Only the first automatic track is used as translation source
                return make_captions(base_url, sub_lang_list)

            self._downloader.report_warning(err_msg)
            return {}

        if 'args' in player_config:
            args = player_config['args']

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1624
21c340b8
S
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-stats URL of a video ("Marking watched").

    The URL is taken from ``player_response`` when present there and from
    ``video_info`` otherwise; nothing happens when neither provides a valid
    URL.  The request itself is non-fatal.
    """
    tracking_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not tracking_url:
        tracking_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(tracking_url)
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63]
        for _ in range(16)]
    cpn = ''.join(cpn_chars)

    query['ver'] = ['2']
    query['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
1650
66c9fa36
S
@staticmethod
def _extract_urls(webpage):
    """Return every YouTube embed reference found in ``webpage``.

    Three sources are scanned, in this order:
      * embedded players (iframe/embed/object tags, ``data-video-url``
        attributes and SWFObject calls) - the quoted player URL;
      * lazyYT containers - the value of ``data-youtube-id``;
      * the Wordpress "YouTube Video Importer" plugin - the value of
        ``data-video_id``.

    The result is a (possibly empty) list of strings.
    """
    # Embedded YouTube player
    entries = [
        unescapeHTML(mobj.group('url'))
        for mobj in re.finditer(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1''', webpage)]
    # NOTE: the embedSWF alternative used to read ``embedSWF\(?:\s*``, which
    # requires a literal colon and therefore never matched ``embedSWF("...")``.

    # lazyYT YouTube embed
    entries.extend(
        unescapeHTML(youtube_id)
        for youtube_id in re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

    # Wordpress "YouTube Video Importer" plugin
    matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # findall yields one tuple per match (q1, q2, id); the id is the last item
    entries.extend(m[-1] for m in matches)

    return entries
1682
@staticmethod
def _extract_url(webpage):
    """Return the first entry found by ``_extract_urls`` or None if there is none."""
    for first_entry in YoutubeIE._extract_urls(webpage):
        return first_entry
    return None
1687
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of ``cls._VALID_URL`` matched against ``url``.

    The pattern is applied in verbose mode.  Raises ExtractorError when the
    URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
1695
84213ea8
S
def _extract_chapters_from_json(self, webpage, video_id, duration):
    """Build the chapter list from the ``ytInitialData`` JSON in ``webpage``.

    Returns None when the page, the JSON or the chapter list is missing,
    otherwise a list of ``{'start_time', 'end_time', 'title'}`` dicts.  A
    chapter ends where the next one starts; the last chapter ends at
    ``duration``.  Chapters without a usable start or end are left out.
    """
    if not webpage:
        return
    raw_initial_data = self._search_regex(
        r'window\["ytInitialData"\] = (.+);\n', webpage,
        'player args', default='{}')
    initial_data = self._parse_json(raw_initial_data, video_id, fatal=False)
    if not (initial_data and isinstance(initial_data, dict)):
        return
    chapters_list = try_get(
        initial_data,
        lambda x: x['playerOverlays']
                   ['playerOverlayRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['decoratedPlayerBarRenderer']
                   ['playerBar']
                   ['chapteredPlayerBarRenderer']
                   ['chapters'],
        list)
    if not chapters_list:
        return

    def start_of(entry):
        # timeRangeStartMillis is scaled down by 1000
        millis = try_get(
            entry,
            lambda x: x['chapterRenderer']['timeRangeStartMillis'],
            int)
        return float_or_none(millis, scale=1000)

    total = len(chapters_list)
    chapters = []
    for index, chapter in enumerate(chapters_list):
        start_time = start_of(chapter)
        if start_time is None:
            continue
        following = index + 1
        if following < total:
            end_time = start_of(chapters_list[following])
        else:
            end_time = duration
        if end_time is None:
            continue
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str),
        })
    return chapters
1744
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Derive chapters from ``seekTo`` timestamp links in an HTML description.

    Every description line containing a ``yt.www.watch.player.seekTo`` link
    is treated as a chapter start; a chapter ends where the next one starts
    and the last one ends at ``duration``.

    Returns None when there is no description or no timestamp line,
    otherwise a list of ``{'start_time', 'end_time', 'title'}`` dicts.

    ``duration`` may be None: the clamping against the video length is then
    skipped and the last chapter (whose end would be unknown) is left out,
    instead of failing on a number/None comparison under Python 3.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        # Timestamps beyond the end of the video are not chapter marks
        if duration is not None and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if duration is not None and end_time > duration:
            end_time = duration
        # Timestamps going backwards mean the list is not a chapter list
        if start_time > end_time:
            break
        # The title is the line with the timestamp link removed
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
1779
84213ea8
S
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, falling back to the description.

    The description is only consulted when the JSON lookup yields nothing
    (None or an empty list).
    """
    json_chapters = self._extract_chapters_from_json(webpage, video_id, duration)
    if json_chapters:
        return json_chapters
    return self._extract_chapters_from_description(description, duration)
1783
c5e8d7af 1784 def _real_extract(self, url):
cf7e015f
S
1785 url, smuggled_data = unsmuggle_url(url, {})
1786
7e8c0af0 1787 proto = (
78caa52a
PH
1788 'http' if self._downloader.params.get('prefer_insecure', False)
1789 else 'https')
7e8c0af0 1790
7c80519c 1791 start_time = None
297a564b 1792 end_time = None
7c80519c
JMF
1793 parsed_url = compat_urllib_parse_urlparse(url)
1794 for component in [parsed_url.fragment, parsed_url.query]:
1795 query = compat_parse_qs(component)
297a564b 1796 if start_time is None and 't' in query:
7c80519c 1797 start_time = parse_duration(query['t'][0])
2929fa0e
JMF
1798 if start_time is None and 'start' in query:
1799 start_time = parse_duration(query['start'][0])
297a564b
JMF
1800 if end_time is None and 'end' in query:
1801 end_time = parse_duration(query['end'][0])
7c80519c 1802
c5e8d7af
PH
1803 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1804 mobj = re.search(self._NEXT_URL_RE, url)
1805 if mobj:
7fd002c0 1806 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
97665381 1807 video_id = self.extract_id(url)
c5e8d7af
PH
1808
1809 # Get video webpage
aa79ac0c 1810 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
011e75e6
S
1811 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1812
1813 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1814 video_id = qs.get('v', [None])[0] or video_id
c5e8d7af
PH
1815
1816 # Attempt to extract SWF player URL
e0df6211 1817 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1818 if mobj is not None:
1819 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1820 else:
1821 player_url = None
1822
d8d24a92
S
1823 dash_mpds = []
1824
1825 def add_dash_mpd(video_info):
1826 dash_mpd = video_info.get('dashmpd')
1827 if dash_mpd and dash_mpd[0] not in dash_mpds:
1828 dash_mpds.append(dash_mpd[0])
1829
561b456e
S
1830 def add_dash_mpd_pr(pl_response):
1831 dash_mpd = url_or_none(try_get(
1832 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1833 compat_str))
1834 if dash_mpd and dash_mpd not in dash_mpds:
1835 dash_mpds.append(dash_mpd)
1836
c7121fa7
S
1837 is_live = None
1838 view_count = None
1839
1840 def extract_view_count(v_info):
1841 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1842
c2d125d9
S
1843 def extract_player_response(player_response, video_id):
1844 pl_response = str_or_none(player_response)
1845 if not pl_response:
1846 return
1847 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1848 if isinstance(pl_response, dict):
1849 add_dash_mpd_pr(pl_response)
1850 return pl_response
1851
fb2c9277
U
1852 def extract_embedded_config(embed_webpage, video_id):
1853 embedded_config = self._search_regex(
1854 r'setConfig\(({.*})\);',
1855 embed_webpage, 'ytInitialData', default=None)
1856 if embedded_config:
1857 return embedded_config
1858
dbdaaa23
S
1859 player_response = {}
1860
c5e8d7af 1861 # Get video info
43ebf77d 1862 video_info = {}
6449cd80 1863 embed_webpage = None
39e7107d
U
1864 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1865 or re.search(r'player-age-gate-content">', video_webpage) is not None):
9d9314cb 1866 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
c108eb73
JMF
1867 age_gate = True
1868 # We simulate the access to the video from www.youtube.com/v/{video_id}
1869 # this can be viewed without login into Youtube
beb95e77
CL
1870 url = proto + '://www.youtube.com/embed/%s' % video_id
1871 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
fb2c9277
U
1872 ext = extract_embedded_config(embed_webpage, video_id)
1873 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1874 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1875 if not playable_in_embed:
1876 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1877 playable_in_embed = ''
1878 else:
1879 playable_in_embed = playable_in_embed.group('playableinEmbed')
1880 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1881 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1882 if playable_in_embed == 'false':
c73baf23
U
1883 '''
1884 # TODO apply this patch when Support for Python 2.6(!) and above drops
9d9314cb 1885 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
4bb9c880 1886 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
c73baf23
U
1887 '''
1888 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1889 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
4bb9c880
U
1890 age_gate = False
1891 # Try looking directly into the video webpage
1892 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1893 if ytplayer_config:
59c5fa91
PO
1894 args = ytplayer_config.get("args")
1895 if args is not None:
1896 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1897 # Convert to the same format returned by compat_parse_qs
1898 video_info = dict((k, [v]) for k, v in args.items())
1899 add_dash_mpd(video_info)
1900 # Rental video is not rented but preview is available (e.g.
1901 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1902 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1903 if not video_info and args.get('ypc_vid'):
1904 return self.url_result(
1905 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1906 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1907 is_live = True
1908 if not player_response:
1909 player_response = extract_player_response(args.get('player_response'), video_id)
1910 elif not player_response:
1911 player_response = ytplayer_config
4bb9c880
U
1912 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1913 add_dash_mpd_pr(player_response)
9d9314cb
U
1914 else:
1915 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1916 else:
1917 data = compat_urllib_parse_urlencode({
1918 'video_id': video_id,
1919 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1920 'sts': self._search_regex(
1921 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1922 })
1923 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1924 try:
1925 video_info_webpage = self._download_webpage(
1926 video_info_url, video_id,
1927 note='Refetching age-gated info webpage',
1928 errnote='unable to download video info webpage')
1929 except ExtractorError:
1930 video_info_webpage = None
1931 if video_info_webpage:
1932 video_info = compat_parse_qs(video_info_webpage)
1933 pl_response = video_info.get('player_response', [None])[0]
1934 player_response = extract_player_response(pl_response, video_id)
1935 add_dash_mpd(video_info)
1936 view_count = extract_view_count(video_info)
c108eb73
JMF
1937 else:
1938 age_gate = False
d8d24a92 1939 # Try looking directly into the video webpage
a72778d3 1940 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
59c5fa91
PO
1941 args = ytplayer_config.get("args")
1942 if args is not None:
4c76aa06 1943 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
d8d24a92
S
1944 # Convert to the same format returned by compat_parse_qs
1945 video_info = dict((k, [v]) for k, v in args.items())
1946 add_dash_mpd(video_info)
6496ccb4
S
1947 # Rental video is not rented but preview is available (e.g.
1948 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
067aa17e 1949 # https://github.com/ytdl-org/youtube-dl/issues/10532)
6496ccb4
S
1950 if not video_info and args.get('ypc_vid'):
1951 return self.url_result(
1952 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
2fe1ff85
JMF
1953 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1954 is_live = True
dbdaaa23 1955 if not player_response:
c2d125d9 1956 player_response = extract_player_response(args.get('player_response'), video_id)
59c5fa91
PO
1957 elif not player_response:
1958 player_response = ytplayer_config
0a3cf9ad 1959 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
561b456e 1960 add_dash_mpd_pr(player_response)
bbb7c3f7
YCH
1961
1962 def extract_unavailable_message():
0add33ab
S
1963 messages = []
1964 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1965 msg = self._html_search_regex(
1966 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1967 video_webpage, 'unavailable %s' % kind, default=None)
1968 if msg:
1969 messages.append(msg)
1970 if messages:
1971 return '\n'.join(messages)
bbb7c3f7 1972
f93abcf1 1973 if not video_info and not player_response:
15be3eb5
RA
1974 unavailable_message = extract_unavailable_message()
1975 if not unavailable_message:
1976 unavailable_message = 'Unable to extract video data'
1977 raise ExtractorError(
1978 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1979
f93abcf1
S
1980 if not isinstance(video_info, dict):
1981 video_info = {}
1982
dbdaaa23
S
1983 video_details = try_get(
1984 player_response, lambda x: x['videoDetails'], dict) or {}
1985
37357d21
S
1986 microformat = try_get(
1987 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1988
8dbf751a
RA
1989 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1990 if not video_title:
cf7e015f
S
1991 self._downloader.report_warning('Unable to extract video title')
1992 video_title = '_'
1993
9cafc3fd 1994 description_original = video_description = get_element_by_id("eow-description", video_webpage)
cf7e015f 1995 if video_description:
fa4bc6e7
RA
1996
1997 def replace_url(m):
1998 redir_url = compat_urlparse.urljoin(url, m.group(1))
1999 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
2000 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
2001 qs = compat_parse_qs(parsed_redir_url.query)
2002 q = qs.get('q')
2003 if q and q[0]:
2004 return q[0]
2005 return redir_url
2006
9cafc3fd 2007 description_original = video_description = re.sub(r'''(?x)
cf7e015f 2008 <a\s+
25cb7a0e 2009 (?:[a-zA-Z-]+="[^"]*"\s+)*?
23f13e97 2010 (?:title|href)="([^"]+)"\s+
25cb7a0e 2011 (?:[a-zA-Z-]+="[^"]*"\s+)*?
525cedb9 2012 class="[^"]*"[^>]*>
23f13e97 2013 [^<]+\.{3}\s*
cf7e015f 2014 </a>
fa4bc6e7 2015 ''', replace_url, video_description)
cf7e015f
S
2016 video_description = clean_html(video_description)
2017 else:
ea74e00b
DP
2018 video_description = video_details.get('shortDescription')
2019 if video_description is None:
2020 video_description = self._html_search_meta('description', video_webpage)
cf7e015f 2021
8fe10494 2022 if not smuggled_data.get('force_singlefeed', False):
5e1eddb9 2023 if not self._downloader.params.get('noplaylist'):
8fe10494
S
2024 multifeed_metadata_list = try_get(
2025 player_response,
2026 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2027 compat_str) or try_get(
2028 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2029 if multifeed_metadata_list:
2030 entries = []
2031 feed_ids = []
2032 for feed in multifeed_metadata_list.split(','):
2033 # Unquote should take place before split on comma (,) since textual
2034 # fields may contain comma as well (see
067aa17e 2035 # https://github.com/ytdl-org/youtube-dl/issues/8536)
8fe10494 2036 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
6b09401b
S
2037
2038 def feed_entry(name):
2039 return try_get(feed_data, lambda x: x[name][0], compat_str)
2040
2041 feed_id = feed_entry('id')
2042 if not feed_id:
2043 continue
2044 feed_title = feed_entry('title')
2045 title = video_title
2046 if feed_title:
2047 title += ' (%s)' % feed_title
8fe10494
S
2048 entries.append({
2049 '_type': 'url_transparent',
2050 'ie_key': 'Youtube',
2051 'url': smuggle_url(
2052 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2053 {'force_singlefeed': True}),
6b09401b 2054 'title': title,
8fe10494 2055 })
6b09401b 2056 feed_ids.append(feed_id)
8fe10494
S
2057 self.to_screen(
2058 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2059 % (', '.join(feed_ids), video_id))
2060 return self.playlist_result(entries, video_id, video_title, video_description)
2061 else:
2062 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
cf7e015f 2063
c7121fa7 2064 if view_count is None:
1c9c8de2 2065 view_count = extract_view_count(video_info)
dbdaaa23
S
2066 if view_count is None and video_details:
2067 view_count = int_or_none(video_details.get('viewCount'))
7b16239a
S
2068 if view_count is None and microformat:
2069 view_count = int_or_none(microformat.get('viewCount'))
1d699755 2070
27019dbb 2071 if is_live is None:
898238e9 2072 is_live = bool_or_none(video_details.get('isLive'))
27019dbb 2073
321bf820 2074 has_live_chat_replay = False
f0f76a33 2075 if not is_live:
321bf820 2076 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2077 try:
2078 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2079 has_live_chat_replay = True
f0f76a33 2080 except (KeyError, IndexError, TypeError):
321bf820 2081 pass
2082
c5e8d7af
PH
2083 # Check for "rental" videos
2084 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
067aa17e 2085 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
c5e8d7af 2086
c63ca0ee
S
2087 def _extract_filesize(media_url):
2088 return int_or_none(self._search_regex(
2089 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2090
bf1317d2
S
2091 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2092 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2093
c5e8d7af
PH
2094 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2095 self.report_rtmp_download()
dd27fd17
PH
2096 formats = [{
2097 'format_id': '_rtmp',
2098 'protocol': 'rtmp',
2099 'url': video_info['conn'][0],
2100 'player_url': player_url,
2101 }]
bf1317d2 2102 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
5f6a1245 2103 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
00fe14fc 2104 if 'rtmpe%3Dyes' in encoded_url_map:
067aa17e 2105 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
bf1317d2 2106 formats = []
3318832e 2107 formats_spec = {}
82156fdb 2108 fmt_list = video_info.get('fmt_list', [''])[0]
2109 if fmt_list:
2110 for fmt in fmt_list.split(','):
2111 spec = fmt.split('/')
3318832e 2112 if len(spec) > 1:
2113 width_height = spec[1].split('x')
2114 if len(width_height) == 2:
2115 formats_spec[spec[0]] = {
2116 'resolution': spec[1],
2117 'width': int_or_none(width_height[0]),
2118 'height': int_or_none(width_height[1]),
2119 }
bf1317d2
S
2120 for fmt in streaming_formats:
2121 itag = str_or_none(fmt.get('itag'))
2122 if not itag:
201e9eaa 2123 continue
bf1317d2
S
2124 quality = fmt.get('quality')
2125 quality_label = fmt.get('qualityLabel') or quality
2126 formats_spec[itag] = {
2127 'asr': int_or_none(fmt.get('audioSampleRate')),
2128 'filesize': int_or_none(fmt.get('contentLength')),
2129 'format_note': quality_label,
2130 'fps': int_or_none(fmt.get('fps')),
2131 'height': int_or_none(fmt.get('height')),
bf1317d2
S
2132 # bitrate for itag 43 is always 2147483647
2133 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2134 'width': int_or_none(fmt.get('width')),
2135 }
2136
2137 for fmt in streaming_formats:
00eb865b 2138 if fmt.get('drmFamilies') or fmt.get('drm_families'):
bf1317d2
S
2139 continue
2140 url = url_or_none(fmt.get('url'))
2141
2142 if not url:
fa3db383 2143 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
bf1317d2
S
2144 if not cipher:
2145 continue
2146 url_data = compat_parse_qs(cipher)
2147 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2148 if not url:
2149 continue
2150 else:
2151 cipher = None
2152 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2153
2f483bc1
S
2154 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2155 # Unsupported FORMAT_STREAM_TYPE_OTF
2156 if stream_type == 3:
2157 continue
6449cd80 2158
bf1317d2
S
2159 format_id = fmt.get('itag') or url_data['itag'][0]
2160 if not format_id:
2161 continue
2162 format_id = compat_str(format_id)
a49eccdf 2163
bf1317d2
S
2164 if cipher:
2165 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
67b19799 2166 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
bf1317d2
S
2167 jsplayer_url_json = self._search_regex(
2168 ASSETS_RE,
2169 embed_webpage if age_gate else video_webpage,
2170 'JS player URL (1)', default=None)
2171 if not jsplayer_url_json and not age_gate:
2172 # We need the embed website after all
2173 if embed_webpage is None:
2174 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2175 embed_webpage = self._download_webpage(
2176 embed_url, video_id, 'Downloading embed webpage')
2177 jsplayer_url_json = self._search_regex(
2178 ASSETS_RE, embed_webpage, 'JS player URL')
2179
2180 player_url = json.loads(jsplayer_url_json)
cf010131 2181 if player_url is None:
bf1317d2
S
2182 player_url_json = self._search_regex(
2183 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2184 video_webpage, 'age gate player URL')
2185 player_url = json.loads(player_url_json)
2186
2187 if 'sig' in url_data:
2188 url += '&signature=' + url_data['sig'][0]
2189 elif 's' in url_data:
2190 encrypted_sig = url_data['s'][0]
2191
2192 if self._downloader.params.get('verbose'):
2193 if player_url is None:
bf1317d2 2194 player_desc = 'unknown'
cf010131 2195 else:
e40c758c
S
2196 player_type, player_version = self._extract_player_info(player_url)
2197 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
bf1317d2
S
2198 parts_sizes = self._signature_cache_id(encrypted_sig)
2199 self.to_screen('{%s} signature length %s, %s' %
2200 (format_id, parts_sizes, player_desc))
2201
2202 signature = self._decrypt_signature(
2203 encrypted_sig, video_id, player_url, age_gate)
2204 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2205 url += '&%s=%s' % (sp, signature)
201e9eaa
PH
2206 if 'ratebypass' not in url:
2207 url += '&ratebypass=yes'
c9afb51c 2208
94278f72
YCH
2209 dct = {
2210 'format_id': format_id,
2211 'url': url,
2212 'player_url': player_url,
2213 }
2214 if format_id in self._formats:
2215 dct.update(self._formats[format_id])
3318832e 2216 if format_id in formats_spec:
2217 dct.update(formats_spec[format_id])
94278f72 2218
aabc2be6 2219 # Some itags are not included in DASH manifest thus corresponding formats will
067aa17e 2220 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
aabc2be6
S
2221 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2222 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2223 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
94278f72 2224
bf1317d2
S
2225 if width is None:
2226 width = int_or_none(fmt.get('width'))
2227 if height is None:
2228 height = int_or_none(fmt.get('height'))
2229
c63ca0ee
S
2230 filesize = int_or_none(url_data.get(
2231 'clen', [None])[0]) or _extract_filesize(url)
2232
bf1317d2
S
2233 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2234 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2235
4878759f
S
2236 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2237 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
bf1317d2 2238 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
54fc90aa 2239
94278f72 2240 more_fields = {
c63ca0ee 2241 'filesize': filesize,
bf1317d2 2242 'tbr': tbr,
c9afb51c
AH
2243 'width': width,
2244 'height': height,
bf1317d2
S
2245 'fps': fps,
2246 'format_note': quality_label or quality,
c9afb51c 2247 }
94278f72
YCH
2248 for key, value in more_fields.items():
2249 if value:
2250 dct[key] = value
bf1317d2 2251 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
aabc2be6
S
2252 if type_:
2253 type_split = type_.split(';')
2254 kind_ext = type_split[0].split('/')
2255 if len(kind_ext) == 2:
94278f72
YCH
2256 kind, _ = kind_ext
2257 dct['ext'] = mimetype2ext(type_split[0])
aabc2be6
S
2258 if kind in ('audio', 'video'):
2259 codecs = None
2260 for mobj in re.finditer(
2261 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2262 if mobj.group('key') == 'codecs':
2263 codecs = mobj.group('val')
2264 break
2265 if codecs:
6310acf5 2266 dct.update(parse_codecs(codecs))
e4a60912
S
2267 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2268 dct['downloader_options'] = {
2269 # Youtube throttles chunks >~10M
2270 'http_chunk_size': 10485760,
2271 }
aabc2be6 2272 formats.append(dct)
c5e8d7af 2273 else:
c3e54389
S
2274 manifest_url = (
2275 url_or_none(try_get(
2276 player_response,
2277 lambda x: x['streamingData']['hlsManifestUrl'],
3089bc74
S
2278 compat_str))
2279 or url_or_none(try_get(
c3e54389
S
2280 video_info, lambda x: x['hlsvp'][0], compat_str)))
2281 if manifest_url:
2282 formats = []
2283 m3u8_formats = self._extract_m3u8_formats(
2284 manifest_url, video_id, 'mp4', fatal=False)
2285 for a_format in m3u8_formats:
2286 itag = self._search_regex(
2287 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2288 if itag:
2289 a_format['format_id'] = itag
2290 if itag in self._formats:
2291 dct = self._formats[itag].copy()
2292 dct.update(a_format)
2293 a_format = dct
2294 a_format['player_url'] = player_url
2295 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2296 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
78895bd3
U
2297 if self._downloader.params.get('youtube_include_hls_manifest', True):
2298 formats.append(a_format)
c3e54389 2299 else:
13577349 2300 error_message = extract_unavailable_message()
c3e54389 2301 if not error_message:
13577349
S
2302 error_message = clean_html(try_get(
2303 player_response, lambda x: x['playabilityStatus']['reason'],
2304 compat_str))
2305 if not error_message:
2306 error_message = clean_html(
2307 try_get(video_info, lambda x: x['reason'][0], compat_str))
c3e54389
S
2308 if error_message:
2309 raise ExtractorError(error_message, expected=True)
2310 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 2311
7e72694b 2312 # uploader
dbdaaa23
S
2313 video_uploader = try_get(
2314 video_info, lambda x: x['author'][0],
2315 compat_str) or str_or_none(video_details.get('author'))
7e72694b
S
2316 if video_uploader:
2317 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2318 else:
2319 self._downloader.report_warning('unable to extract uploader name')
2320
2321 # uploader_id
2322 video_uploader_id = None
2323 video_uploader_url = None
2324 mobj = re.search(
2325 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2326 video_webpage)
2327 if mobj is not None:
2328 video_uploader_id = mobj.group('uploader_id')
2329 video_uploader_url = mobj.group('uploader_url')
a6211d23
S
2330 else:
2331 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2332 if owner_profile_url:
2333 video_uploader_id = self._search_regex(
2334 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2335 default=None)
2336 video_uploader_url = owner_profile_url
7e72694b 2337
b45a9e69 2338 channel_id = (
3089bc74
S
2339 str_or_none(video_details.get('channelId'))
2340 or self._html_search_meta(
2341 'channelId', video_webpage, 'channel id', default=None)
2342 or self._search_regex(
b45a9e69 2343 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2344 video_webpage, 'channel id', default=None, group='id'))
dd4c4492
S
2345 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2346
b477fc13
S
2347 thumbnails = []
2348 thumbnails_list = try_get(
2349 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2350 for t in thumbnails_list:
2351 if not isinstance(t, dict):
2352 continue
2353 thumbnail_url = url_or_none(t.get('url'))
2354 if not thumbnail_url:
2355 continue
2356 thumbnails.append({
2357 'url': thumbnail_url,
2358 'width': int_or_none(t.get('width')),
2359 'height': int_or_none(t.get('height')),
2360 })
2361
2362 if not thumbnails:
7e72694b 2363 video_thumbnail = None
b477fc13
S
2364 # We try first to get a high quality image:
2365 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2366 video_webpage, re.DOTALL)
2367 if m_thumb is not None:
2368 video_thumbnail = m_thumb.group(1)
2369 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2370 if thumbnail_url:
2371 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2372 if video_thumbnail:
2373 thumbnails.append({'url': video_thumbnail})
7e72694b
S
2374
2375 # upload date
2376 upload_date = self._html_search_meta(
2377 'datePublished', video_webpage, 'upload date', default=None)
2378 if not upload_date:
2379 upload_date = self._search_regex(
2380 [r'(?s)id="eow-date.*?>(.*?)</span>',
2381 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2382 video_webpage, 'upload date', default=None)
37357d21
S
2383 if not upload_date:
2384 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
7e72694b
S
2385 upload_date = unified_strdate(upload_date)
2386
2387 video_license = self._html_search_regex(
2388 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2389 video_webpage, 'license', default=None)
2390
2391 m_music = re.search(
2392 r'''(?x)
2393 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2394 <ul[^>]*>\s*
2395 <li>(?P<title>.+?)
2396 by (?P<creator>.+?)
2397 (?:
2398 \(.+?\)|
2399 <a[^>]*
2400 (?:
2401 \bhref=["\']/red[^>]*>| # drop possible
2402 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2403 )
2404 .*?
2405 )?</li
2406 ''',
2407 video_webpage)
2408 if m_music:
2409 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2410 video_creator = clean_html(m_music.group('creator'))
2411 else:
2412 video_alt_title = video_creator = None
2413
2414 def extract_meta(field):
2415 return self._html_search_regex(
2416 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2417 video_webpage, field, default=None)
2418
2419 track = extract_meta('Song')
2420 artist = extract_meta('Artist')
92bc97d3 2421 album = extract_meta('Album')
822b9d9c
RA
2422
2423 # Youtube Music Auto-generated description
92bc97d3 2424 release_date = release_year = None
822b9d9c
RA
2425 if video_description:
2426 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2427 if mobj:
2428 if not track:
2429 track = mobj.group('track').strip()
2430 if not artist:
2431 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
92bc97d3
RA
2432 if not album:
2433 album = mobj.group('album'.strip())
822b9d9c
RA
2434 release_year = mobj.group('release_year')
2435 release_date = mobj.group('release_date')
2436 if release_date:
2437 release_date = release_date.replace('-', '')
2438 if not release_year:
2439 release_year = int(release_date[:4])
2440 if release_year:
2441 release_year = int(release_year)
7e72694b 2442
9322f116 2443 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2444 if yt_initial:
2445 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2446 if len(music_metadata):
2447 album = music_metadata[0].get('album')
2448 artist = music_metadata[0].get('artist')
2449 track = music_metadata[0].get('track')
2450
7e72694b
S
2451 m_episode = re.search(
2452 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2453 video_webpage)
2454 if m_episode:
c2dd2dc0 2455 series = unescapeHTML(m_episode.group('series'))
7e72694b
S
2456 season_number = int(m_episode.group('season'))
2457 episode_number = int(m_episode.group('episode'))
2458 else:
2459 series = season_number = episode_number = None
2460
2461 m_cat_container = self._search_regex(
2462 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2463 video_webpage, 'categories', default=None)
dbeafce5 2464 category = None
7e72694b
S
2465 if m_cat_container:
2466 category = self._html_search_regex(
2467 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2468 default=None)
dbeafce5
S
2469 if not category:
2470 category = try_get(
2471 microformat, lambda x: x['category'], compat_str)
2472 video_categories = None if category is None else [category]
7e72694b
S
2473
2474 video_tags = [
2475 unescapeHTML(m.group('content'))
2476 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
dbeafce5
S
2477 if not video_tags:
2478 video_tags = try_get(video_details, lambda x: x['keywords'], list)
7e72694b
S
2479
2480 def _extract_count(count_name):
2481 return str_to_int(self._search_regex(
a6c666d0 2482 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
7e72694b
S
2483 % re.escape(count_name),
2484 video_webpage, count_name, default=None))
2485
2486 like_count = _extract_count('like')
2487 dislike_count = _extract_count('dislike')
2488
dbdaaa23
S
2489 if view_count is None:
2490 view_count = str_to_int(self._search_regex(
2491 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2492 'view count', default=None))
2493
bf3c9326
S
2494 average_rating = (
2495 float_or_none(video_details.get('averageRating'))
2496 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2497
7e72694b 2498 # subtitles
321bf820 2499 video_subtitles = self.extract_subtitles(
2500 video_id, video_webpage, has_live_chat_replay)
7e72694b
S
2501 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2502
2503 video_duration = try_get(
2504 video_info, lambda x: int_or_none(x['length_seconds'][0]))
dbdaaa23
S
2505 if not video_duration:
2506 video_duration = int_or_none(video_details.get('lengthSeconds'))
7e72694b
S
2507 if not video_duration:
2508 video_duration = parse_duration(self._html_search_meta(
2509 'duration', video_webpage, 'video duration'))
2510
b84071c0
JP
2511 # Get Subscriber Count of channel
2512 subscriber_count = parse_count(self._search_regex(
2513 r'"text":"([\d\.]+\w?) subscribers"',
2514 video_webpage,
2515 'subscriber count',
2516 default=None
2517 ))
2518
7e72694b
S
2519 # annotations
2520 video_annotations = None
2521 if self._downloader.params.get('writeannotations', False):
64b6a4e9
RA
2522 xsrf_token = self._search_regex(
2523 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2524 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2525 invideo_url = try_get(
2526 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2527 if xsrf_token and invideo_url:
2528 xsrf_field_name = self._search_regex(
2529 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2530 video_webpage, 'xsrf field name',
2531 group='xsrf_field_name', default='session_token')
2532 video_annotations = self._download_webpage(
2533 self._proto_relative_url(invideo_url),
2534 video_id, note='Downloading annotations',
2535 errnote='Unable to download video annotations', fatal=False,
2536 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
7e72694b 2537
84213ea8 2538 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
7e72694b 2539
dd27fd17 2540 # Look for the DASH manifest
203fb43f 2541 if self._downloader.params.get('youtube_include_dash_manifest', True):
77c6fb5b 2542 dash_mpd_fatal = True
8ff648e4 2543 for mpd_url in dash_mpds:
d8d24a92 2544 dash_formats = {}
774e208f 2545 try:
05d0d131
YCH
2546 def decrypt_sig(mobj):
2547 s = mobj.group(1)
2548 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2549 return '/signature/%s' % dec_s
2550
8ff648e4 2551 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2d2fa82d 2552
8ff648e4 2553 for df in self._extract_mpd_formats(
2554 mpd_url, video_id, fatal=dash_mpd_fatal,
2555 formats_dict=self._formats):
c63ca0ee
S
2556 if not df.get('filesize'):
2557 df['filesize'] = _extract_filesize(df['url'])
d8d24a92
S
2558 # Do not overwrite DASH format found in some previous DASH manifest
2559 if df['format_id'] not in dash_formats:
2560 dash_formats[df['format_id']] = df
77c6fb5b
S
2561 # Additional DASH manifests may end up in HTTP Error 403 therefore
2562 # allow them to fail without bug report message if we already have
2563 # some DASH manifest succeeded. This is temporary workaround to reduce
2564 # burst of bug reports until we figure out the reason and whether it
2565 # can be fixed at all.
2566 dash_mpd_fatal = False
774e208f
PH
2567 except (ExtractorError, KeyError) as e:
2568 self.report_warning(
2569 'Skipping DASH manifest: %r' % e, video_id)
d8d24a92 2570 if dash_formats:
04b3b3df
JMF
2571 # Remove the formats we found through non-DASH, they
2572 # contain less info and it can be wrong, because we use
2573 # fixed values (for example the resolution). See
067aa17e 2574 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
04b3b3df 2575 # example.
d80265cc 2576 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
d8d24a92 2577 formats.extend(dash_formats.values())
d80044c2 2578
6271f1ca
PH
2579 # Check for malformed aspect ratio
2580 stretched_m = re.search(
2581 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2582 video_webpage)
2583 if stretched_m:
313dfc45
LL
2584 w = float(stretched_m.group('w'))
2585 h = float(stretched_m.group('h'))
5faf9fed
S
2586 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2587 # We will only process correct ratios.
313dfc45 2588 if w > 0 and h > 0:
41f24c32 2589 ratio = w / h
313dfc45
LL
2590 for f in formats:
2591 if f.get('vcodec') != 'none':
2592 f['stretched_ratio'] = ratio
6271f1ca 2593
026fbedc 2594 if not formats:
43ebf77d
S
2595 if 'reason' in video_info:
2596 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2597 regions_allowed = self._html_search_meta(
2598 'regionsAllowed', video_webpage, default=None)
2599 countries = regions_allowed.split(',') if regions_allowed else None
2600 self.raise_geo_restricted(
2601 msg=video_info['reason'][0], countries=countries)
2602 reason = video_info['reason'][0]
2603 if 'Invalid parameters' in reason:
2604 unavailable_message = extract_unavailable_message()
2605 if unavailable_message:
2606 reason = unavailable_message
2607 raise ExtractorError(
2608 'YouTube said: %s' % reason,
2609 expected=True, video_id=video_id)
2610 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2611 raise ExtractorError('This video is DRM protected.', expected=True)
0d297518 2612
4bcc7bd1 2613 self._sort_formats(formats)
4ea3be0a 2614
21c340b8 2615 self.mark_watched(video_id, video_info, player_response)
d77ab8e2 2616
4ea3be0a 2617 return {
8bcc8756
JW
2618 'id': video_id,
2619 'uploader': video_uploader,
2620 'uploader_id': video_uploader_id,
fd050249 2621 'uploader_url': video_uploader_url,
dd4c4492
S
2622 'channel_id': channel_id,
2623 'channel_url': channel_url,
8bcc8756 2624 'upload_date': upload_date,
7caf9830 2625 'license': video_license,
936784b2 2626 'creator': video_creator or artist,
8bcc8756 2627 'title': video_title,
936784b2 2628 'alt_title': video_alt_title or track,
b477fc13 2629 'thumbnails': thumbnails,
8bcc8756
JW
2630 'description': video_description,
2631 'categories': video_categories,
000b6b5a 2632 'tags': video_tags,
8bcc8756 2633 'subtitles': video_subtitles,
360e1ca5 2634 'automatic_captions': automatic_captions,
8bcc8756
JW
2635 'duration': video_duration,
2636 'age_limit': 18 if age_gate else 0,
2637 'annotations': video_annotations,
9cafc3fd 2638 'chapters': chapters,
7e8c0af0 2639 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
8bcc8756 2640 'view_count': view_count,
4ea3be0a 2641 'like_count': like_count,
2642 'dislike_count': dislike_count,
bf3c9326 2643 'average_rating': average_rating,
8bcc8756 2644 'formats': formats,
2fe1ff85 2645 'is_live': is_live,
7c80519c 2646 'start_time': start_time,
297a564b 2647 'end_time': end_time,
12afdc2a
S
2648 'series': series,
2649 'season_number': season_number,
2650 'episode_number': episode_number,
936784b2
S
2651 'track': track,
2652 'artist': artist,
5caabd3c 2653 'album': album,
2654 'release_date': release_date,
2655 'release_year': release_year,
b84071c0 2656 'subscriber_count': subscriber_count,
4ea3be0a 2657 }
c5e8d7af 2658
5f6a1245 2659
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, mixes and bare playlist ids.

    ``_real_extract`` decides between three outcomes:

    * a single video (``--no-playlist`` was given, or the URL does not
      actually serve a playlist),
    * a "mix" (ids starting with RD/UL/PU that are not YouTube Music
      playlists), which is crawled page by page in ``_extract_mix``,
    * a regular playlist, scraped in ``_extract_playlist``.
    """

    IE_DESC = 'YouTube.com playlists'
    # Group 1 captures the playlist id found inside a URL, group 2 captures
    # a bare playlist id given instead of a URL.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
    _YTM_CHANNEL_INFO = {
        'uploader': 'Youtube Music',
        'uploader_id': 'music',  # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
        'uploader_url': 'https://www.youtube.com/music'
    }
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def extract_videos_from_page(self, page):
        """Collect (video_id, title) pairs from one chunk of playlist HTML.

        Returns a ``zip`` over two parallel lists; a title is ``None``
        when the markup carried no ``data-title`` attribute.
        """
        ids_in_page = []
        titles_in_page = []

        # Primary source: any tag carrying a data-video-id attribute.
        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(attrs['data-video-id'])
            titles_in_page.append(video_title)

        # The helper below is defined outside this class; it is handed both
        # lists, presumably to extend them in place with further matches.
        fallback_patterns = (
            # Fallback with old _VIDEO_RE
            self._VIDEO_RE,
            # Relaxed fallbacks
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
        )
        for pattern in fallback_patterns:
            self.extract_videos_from_page_impl(
                pattern, page, ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix_ids_from_yt_initial(self, yt_initial):
        """Return the video ids listed in the playlist panel of ytInitialData.

        Entries without a ``playlistPanelVideoRenderer.videoId`` string are
        skipped; an empty list is returned when the panel is missing.
        """
        playlist_contents = try_get(
            yt_initial,
            lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'],
            list) or []
        ids = []
        for item in playlist_contents:
            video_id = try_get(
                item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
            if video_id:
                ids.append(video_id)
        return ids

    def _extract_mix(self, playlist_id):
        """Crawl a YouTube mix and return it as a playlist result."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        yt_initial = None
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))

            # if no ids in html of page, try using embedded json
            if not new_ids:
                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
                if yt_initial:
                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        if not title:
            # The embedded JSON is only parsed in the loop when the HTML id
            # scrape came back empty, so it may not have been loaded yet.
            # Parse it from the last page so the fallback can still work.
            if yt_initial is None:
                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
            title = try_get(
                yt_initial,
                lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'],
                compat_str)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Scrape a regular playlist page.

        Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is
        ``False`` only when the page has no title and yields no entries.

        Raises ExtractorError (expected) when YouTube reports the playlist
        as missing or private, or the parameters as invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            if re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            if re.match(r'[^<]*Choose your language[^<]*', match):
                # Harmless language picker banner, not an error.
                continue
            self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })
        if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
            # The fixed YouTube Music channel info overrides whatever was
            # scraped from the page.
            playlist.update(self._YTM_CHANNEL_INFO)

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Detect a video id in ``url`` and honour ``--no-playlist``.

        Returns ``(video_id, result)``: ``result`` is a url_result for the
        single video when ``noplaylist`` is set, otherwise ``None``.
        ``(None, None)`` is returned when the URL names no video.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if not video_id:
            return None, None

        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Dispatch ``url`` to single-video, mix or playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        # Mixes require a custom extraction process,
        # Youtube Music playlists act like normal playlists (with randomized order)
        is_mix = (
            playlist_id.startswith(('RD', 'UL', 'PU'))
            and not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX))
        if is_mix:
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
448830ce 3052
c5e8d7af 3053
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` URLs.

    Hands off to the channel's "uploads" playlist when its id can be found,
    otherwise lists the videos found on the channel's /videos page.
    """
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the /videos listing URL for channel_id."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Return a playlist result (or a redirect to one) for the channel."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if probe_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Second chance: dig the id out of the app deep-link metadata
                app_link = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_link:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_link, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to address the uploads playlist
            uploads_id = 'UU' + channel_playlist_id[2:]
            uploads_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % uploads_id)
            return self.url_result(uploads_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    vid, 'Youtube', video_id=vid, video_title=vid_title)
                for vid, vid_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        # Peek at the first entry; an empty listing may carry an alert
        # message explaining why nothing is available
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
c5e8d7af
PH
3153
3154
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for ``/user/<name>``, ``/c/<name>``, bare ``/<name>`` and ``ytuser:`` URLs."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept url only if no other Youtube*IE class in this module does."""
        # The regex above is very permissive and would also match URLs that
        # belong to other youtube extractors, so give all of them precedence.
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the /videos URL, keeping the 'user' or 'c' path kind of url."""
        match = re.match(self._VALID_URL, url)
        # URLs without an explicit kind (bare name, ytuser:) default to 'user'
        path_kind = match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, match.group('id'))
3212
b05654f0 3213
f07e276a
S
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of a user or channel."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the video found on the /live page, else defer to the base URL."""
        base_url, channel_id = re.match(
            self._VALID_URL, url).group('base_url', 'id')
        fallback = self.url_result(base_url)

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return fallback

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page carrying an 11-char id
        if (page_type.startswith('video') and video_id
                and re.match(r'^[0-9A-Za-z_-]{11}$', video_id)):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return fallback
3264
3265
e462474e
S
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Purely declarative: this class only supplies the URL pattern, names
    # and test cases; no extraction method is defined in the class body.
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
0c148415
S
3298
3299
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    """Search extractor that pages through the youtubei/v1/search JSON endpoint."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the 'params' field of the request body;
    # subclasses override it to change the search behaviour
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent entries for query.

        Pages are requested one after another; iteration stops when n
        entries have been yielded, when a page fails to download or has
        no recognisable results, or when no continuation token is found.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and continuation pages nest the section list
            # under different keys, so both locations are tried
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Counts are rendered with thousands separators
                # ("1,234,567 views"); capture the separators as well so the
                # number is not cut off at the first comma, then let
                # str_to_int strip them
                view_count = str_to_int(self._search_regex(
                    r'^(\d[\d,]*)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # The token for the next page sits in the second element of the
            # section list
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
75dff0ee 3388
c9ae7b95 3389
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class, only with a different search key
    # and a 'params' value sent along with every request.
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _SEARCH_PARAMS = 'CAI%3D'
75dff0ee 3395
c9ae7b95 3396
class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``youtube.com/results?search_query=...`` URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _process_json_dict(self, obj, videos, c):
        """Collect obj into videos, or record its continuation data in c."""
        if "videoId" in obj:
            videos.append(obj)
        elif "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Fill the two parallel lists with ids and titles found in page."""
        raw_data = self._search_regex(self._SEARCH_DATA, page, 'ytInitialData')
        search_response = self._parse_json(raw_data, None)

        for renderer in self._find_videos_in_json(search_response):
            video_id = try_get(renderer, lambda x: x['videoId'])
            # The title is stored either as a list of runs or as simple text
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                # Already known: only fill in a title that is still missing
                position = ids_in_page.index(video_id)
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return (id, title) pairs for the videos found in page."""
        found_ids, found_titles = [], []
        self.extract_videos_from_page_impl(page, found_ids, found_titles)
        return zip(found_ids, found_titles)

    def _real_extract(self, url):
        """Download the results page and wrap its entries in a playlist."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._entries(webpage, query, n=5)
        return self.playlist_result(entries, playlist_title=query)
c9ae7b95
PH
3456
3457
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``youtube.com/show/<id>`` URLs."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's /playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
04cc9617
JMF
3475
3476
class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name, derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _process_entries(self, entries, seen):
        """Yield url results for entries whose videoId is not yet in seen.

        seen is extended in place with the newly accepted entries.
        """
        fresh = []
        for candidate in entries:
            candidate_id = try_get(candidate, lambda x: x['videoId'])
            if not candidate_id:
                continue
            # Only entries recorded by earlier calls count as duplicates
            if any(known['videoId'] == candidate_id for known in seen):
                continue
            fresh.append(candidate)

        if not fresh:
            return

        seen.extend(fresh)
        for video in fresh:
            yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        feed_title = self._PLAYLIST_TITLE
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, feed_title)
        return self.playlist_result(
            self._entries(page, self._PLAYLIST_TITLE),
            playlist_title=self._PLAYLIST_TITLE)
25f14e9f
S
3520
3521
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the 'WL' (watch later) playlist."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single requested video if any, else the WL playlist."""
        _, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _, watch_later = self._extract_playlist('WL')
            return watch_later
        return single_video
f459d170 3541
5f6a1245 3542
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the my_favorites page to its playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and defer to it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
3553
3554
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Declarative feed extractor: only the feed name, playlist title and
    # URL pattern are defined here.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 3560
1ed5b5c9 3561
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Declarative feed extractor: only the feed name, playlist title and
    # URL pattern are defined here.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 3567
1ed5b5c9 3568
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Declarative feed extractor: only the feed name, playlist title and
    # URL pattern are defined here.
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
1ed5b5c9
JMF
3574
3575
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that carry no video id and reports them."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise: a matching URL has lost its video id."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
3623
3624
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose id is 1 to 10 characters long and reports them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise: the id in a matching URL is incomplete."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)