]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/youtube.py
Merge pull request #74 from blackjack4494/master
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_count,
43 parse_duration,
44 remove_quotes,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 uppercase_escape,
54 url_or_none,
55 urlencode_postdata,
56 )
57
58
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's (undocumented) web sign-in flow used by _login():
    # lookup -> challenge -> (optional TFA) -> CheckCookie.
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches every known playlist-ID prefix (regular, liked, uploads, mixes, albums, ...)
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Identify requests as coming from the desktop web client (client-name 1).
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Force English (hl=en) via the PREF cookie so scraped pages contain
        # predictable, parseable text regardless of the user's locale.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap each bare video ID in a url_result() dict to be handled by YoutubeIE.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        # NOTE(review): some failure paths below use a bare `return` (None)
        # rather than `return False`; callers treating the result as a
        # boolean see both as falsy, so behavior is unchanged.
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden <input> fields of the login form are carried over into every
        # subsequent signin request.
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST a positional-JSON payload (`f.req`) to a signin endpoint and
            # parse the JSON reply after stripping the anti-XSSI prefix.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Responses start with a non-JSON prefix (e.g. ")]}'"); drop
                # everything up to the first '['.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account by username. The payload is a positional
        # structure reverse-engineered from the web client; do not reorder.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # The opaque account token needed for the password/TFA steps.
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password for the looked-up account.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A populated x[0][5] entry signals a login error (e.g. bad password).
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Step 3 (optional): Google asked for an additional challenge.
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # Session token required to address the TFA endpoint.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users often paste codes with the SMS "G-" prefix; strip it.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                # Same error-entry convention as the password step above.
                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Challenges we cannot solve automatically; tell the user to
                # resolve them in a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Step 4: fetch the CheckCookie URL to finalize the session cookies.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through the account page.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Inject disable_polymer=true into every request so YouTube serves the
        # old (non-Polymer) page layout that the scraping regexes expect.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Set the language cookie and attempt login before any extraction.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
288
289
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for extractors whose pages paginate via a "Load more" button."""

    def _entries(self, page, playlist_id):
        # Yield entries page by page, following the AJAX "Load more" link
        # embedded in each widget until no further link is present.
        content_html = page
        more_widget_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if mobj is None:
                break

            # Intermittent 5xx responses are usually transient, so retry the
            # page download a few times before giving up.
            max_retries = 3
            attempt = 0
            while True:
                try:
                    more = self._download_json(
                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                    break
                except ExtractorError as e:
                    is_server_error = (
                        isinstance(e.cause, compat_HTTPError)
                        and e.cause.code in (500, 503))
                    attempt += 1
                    if not (is_server_error and attempt <= max_retries):
                        raise

            content_html = more['content_html']
            if not content_html.strip():
                # A "Load more" button may be present even when there are no
                # further videos to fetch.
                break
            more_widget_html = more['load_more_widget_html']
328
329
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for playlist-like extractors that scrape video links from a page."""

    def _process_page(self, content):
        # Wrap each scraped (id, title) pair into a url_result() dict that is
        # then handled by YoutubeIE.
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, accumulating IDs/titles in place.

        ids_in_page and titles_in_page are parallel lists updated in place so
        this can be called repeatedly with different regexes; a title found by
        a later match fills in a previously-untitled ID.
        """
        for mobj in re.finditer(video_re, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # Bug fix: compare the 'index' group (playlist position) rather
            # than the 'id' group — an 11-character video ID can never equal
            # '0', so the old check was dead code contradicting the comment.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(
                mobj.group('title')) if 'title' in mobj.groupdict() else None
            if video_title:
                video_title = video_title.strip()
            # The "Play all" pseudo-link carries no real video title.
            if video_title == '► Play all':
                video_title = None
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        # Collect (video_id, title) pairs using the subclass-provided _VIDEO_RE.
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
361
362
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for extractors listing the playlists found on a channel/user page."""

    def _process_page(self, content):
        # Each "lockup title" heading links to one playlist; orderedSet drops
        # duplicates while preserving on-page order.
        playlist_link_re = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        for pl_id in orderedSet(re.findall(playlist_link_re, content)):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % pl_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        # Download the first page, then delegate pagination to _entries().
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)
        page_title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, list_id), list_id, page_title)
376
377
378 class YoutubeIE(YoutubeBaseInfoExtractor):
379 IE_DESC = 'YouTube.com'
380 _VALID_URL = r"""(?x)^
381 (
382 (?:https?://|//) # http(s):// or protocol-independent URL
383 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
384 (?:www\.)?deturl\.com/www\.youtube\.com/|
385 (?:www\.)?pwnyoutube\.com/|
386 (?:www\.)?hooktube\.com/|
387 (?:www\.)?yourepeat\.com/|
388 tube\.majestyc\.net/|
389 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
390 (?:(?:www|dev)\.)?invidio\.us/|
391 (?:(?:www|no)\.)?invidiou\.sh/|
392 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
393 (?:www\.)?invidious\.kabi\.tk/|
394 (?:www\.)?invidious\.13ad\.de/|
395 (?:www\.)?invidious\.mastodon\.host/|
396 (?:www\.)?invidious\.nixnet\.xyz/|
397 (?:www\.)?invidious\.drycat\.fr/|
398 (?:www\.)?tube\.poal\.co/|
399 (?:www\.)?vid\.wxzm\.sx/|
400 (?:www\.)?yewtu\.be/|
401 (?:www\.)?yt\.elukerio\.org/|
402 (?:www\.)?yt\.lelux\.fi/|
403 (?:www\.)?invidious\.ggc-project\.de/|
404 (?:www\.)?yt\.maisputain\.ovh/|
405 (?:www\.)?invidious\.13ad\.de/|
406 (?:www\.)?invidious\.toot\.koeln/|
407 (?:www\.)?invidious\.fdn\.fr/|
408 (?:www\.)?watch\.nettohikari\.com/|
409 (?:www\.)?kgg2m7yk5aybusll\.onion/|
410 (?:www\.)?qklhadlycap4cnod\.onion/|
411 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
412 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
413 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
414 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
415 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
416 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
417 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
418 (?:.*?\#/)? # handle anchor (#/) redirect urls
419 (?: # the various things that can precede the ID:
420 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
421 |(?: # or the v= param in all its forms
422 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
423 (?:\?|\#!?) # the params delimiter ? or # or #!
424 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
425 v=
426 )
427 ))
428 |(?:
429 youtu\.be| # just youtu.be/xxxx
430 vid\.plus| # or vid.plus/xxxx
431 zwearz\.com/watch| # or zwearz.com/watch/xxxx
432 )/
433 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
434 )
435 )? # all until now is optional -> you can pass the naked ID
436 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
437 (?!.*?\blist=
438 (?:
439 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
440 WL # WL are handled by the watch later IE
441 )
442 )
443 (?(1).+)? # if we found the ID, everything can follow
444 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
445 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
446 _PLAYER_INFO_RE = (
447 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
448 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
449 )
450 _formats = {
451 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
452 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
453 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
454 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
455 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
456 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
457 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
458 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
459 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
460 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
461 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
462 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
463 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
464 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
465 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
466 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
467 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
468 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
469
470
471 # 3D videos
472 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
473 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
474 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
475 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
476 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
477 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
478 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
479
480 # Apple HTTP Live Streaming
481 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
482 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
483 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
484 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
485 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
486 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
487 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
488 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
489
490 # DASH mp4 video
491 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
497 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
499 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
500 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
501 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
502 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
503
504 # Dash mp4 audio
505 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
506 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
507 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
508 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
509 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
510 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
511 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
512
513 # Dash webm
514 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
519 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
520 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
521 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
529 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
530 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
535 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
536
537 # Dash webm audio
538 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
539 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
540
541 # Dash webm audio with opus inside
542 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
543 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
544 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
545
546 # RTMP (unnamed)
547 '_rtmp': {'protocol': 'rtmp'},
548
549 # av01 video only formats sometimes served with "unknown" codecs
550 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
553 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
554 }
555 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
556
557 _GEO_BYPASS = False
558
559 IE_NAME = 'youtube'
560 _TESTS = [
561 {
562 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
563 'info_dict': {
564 'id': 'BaW_jenozKc',
565 'ext': 'mp4',
566 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
567 'uploader': 'Philipp Hagemeister',
568 'uploader_id': 'phihag',
569 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
570 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
571 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
572 'upload_date': '20121002',
573 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
574 'categories': ['Science & Technology'],
575 'tags': ['youtube-dl'],
576 'duration': 10,
577 'view_count': int,
578 'like_count': int,
579 'dislike_count': int,
580 'start_time': 1,
581 'end_time': 9,
582 }
583 },
584 {
585 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
586 'note': 'Embed-only video (#1746)',
587 'info_dict': {
588 'id': 'yZIXLfi8CZQ',
589 'ext': 'mp4',
590 'upload_date': '20120608',
591 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
592 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
593 'uploader': 'SET India',
594 'uploader_id': 'setindia',
595 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
596 'age_limit': 18,
597 }
598 },
599 {
600 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
601 'note': 'Use the first video ID in the URL',
602 'info_dict': {
603 'id': 'BaW_jenozKc',
604 'ext': 'mp4',
605 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
606 'uploader': 'Philipp Hagemeister',
607 'uploader_id': 'phihag',
608 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
609 'upload_date': '20121002',
610 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
611 'categories': ['Science & Technology'],
612 'tags': ['youtube-dl'],
613 'duration': 10,
614 'view_count': int,
615 'like_count': int,
616 'dislike_count': int,
617 },
618 'params': {
619 'skip_download': True,
620 },
621 },
622 {
623 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
624 'note': '256k DASH audio (format 141) via DASH manifest',
625 'info_dict': {
626 'id': 'a9LDPn-MO4I',
627 'ext': 'm4a',
628 'upload_date': '20121002',
629 'uploader_id': '8KVIDEO',
630 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
631 'description': '',
632 'uploader': '8KVIDEO',
633 'title': 'UHDTV TEST 8K VIDEO.mp4'
634 },
635 'params': {
636 'youtube_include_dash_manifest': True,
637 'format': '141',
638 },
639 'skip': 'format 141 not served anymore',
640 },
641 # Controversy video
642 {
643 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
644 'info_dict': {
645 'id': 'T4XJQO3qol8',
646 'ext': 'mp4',
647 'duration': 219,
648 'upload_date': '20100909',
649 'uploader': 'Amazing Atheist',
650 'uploader_id': 'TheAmazingAtheist',
651 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
652 'title': 'Burning Everyone\'s Koran',
653 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
654 }
655 },
656 # Normal age-gate video (embed allowed)
657 {
658 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
659 'info_dict': {
660 'id': 'HtVdAasjOgU',
661 'ext': 'mp4',
662 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
663 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
664 'duration': 142,
665 'uploader': 'The Witcher',
666 'uploader_id': 'WitcherGame',
667 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
668 'upload_date': '20140605',
669 'age_limit': 18,
670 },
671 },
672 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
673 {
674 'url': 'lqQg6PlCWgI',
675 'info_dict': {
676 'id': 'lqQg6PlCWgI',
677 'ext': 'mp4',
678 'duration': 6085,
679 'upload_date': '20150827',
680 'uploader_id': 'olympic',
681 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
682 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
683 'uploader': 'Olympic',
684 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
685 },
686 'params': {
687 'skip_download': 'requires avconv',
688 }
689 },
690 # Non-square pixels
691 {
692 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
693 'info_dict': {
694 'id': '_b-2C3KPAM0',
695 'ext': 'mp4',
696 'stretched_ratio': 16 / 9.,
697 'duration': 85,
698 'upload_date': '20110310',
699 'uploader_id': 'AllenMeow',
700 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
701 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
702 'uploader': '孫ᄋᄅ',
703 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
704 },
705 },
706 # url_encoded_fmt_stream_map is empty string
707 {
708 'url': 'qEJwOuvDf7I',
709 'info_dict': {
710 'id': 'qEJwOuvDf7I',
711 'ext': 'webm',
712 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
713 'description': '',
714 'upload_date': '20150404',
715 'uploader_id': 'spbelect',
716 'uploader': 'Наблюдатели Петербурга',
717 },
718 'params': {
719 'skip_download': 'requires avconv',
720 },
721 'skip': 'This live event has ended.',
722 },
723 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
724 {
725 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
726 'info_dict': {
727 'id': 'FIl7x6_3R5Y',
728 'ext': 'webm',
729 'title': 'md5:7b81415841e02ecd4313668cde88737a',
730 'description': 'md5:116377fd2963b81ec4ce64b542173306',
731 'duration': 220,
732 'upload_date': '20150625',
733 'uploader_id': 'dorappi2000',
734 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
735 'uploader': 'dorappi2000',
736 'formats': 'mincount:31',
737 },
738 'skip': 'not actual anymore',
739 },
740 # DASH manifest with segment_list
741 {
742 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
743 'md5': '8ce563a1d667b599d21064e982ab9e31',
744 'info_dict': {
745 'id': 'CsmdDsKjzN8',
746 'ext': 'mp4',
747 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
748 'uploader': 'Airtek',
749 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
750 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
751 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
752 },
753 'params': {
754 'youtube_include_dash_manifest': True,
755 'format': '135', # bestvideo
756 },
757 'skip': 'This live event has ended.',
758 },
759 {
760 # Multifeed videos (multiple cameras), URL is for Main Camera
761 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
762 'info_dict': {
763 'id': 'jqWvoWXjCVs',
764 'title': 'teamPGP: Rocket League Noob Stream',
765 'description': 'md5:dc7872fb300e143831327f1bae3af010',
766 },
767 'playlist': [{
768 'info_dict': {
769 'id': 'jqWvoWXjCVs',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
773 'duration': 7335,
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
778 'license': 'Standard YouTube License',
779 },
780 }, {
781 'info_dict': {
782 'id': '6h8e8xoXJzg',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
786 'duration': 7337,
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
791 'license': 'Standard YouTube License',
792 },
793 }, {
794 'info_dict': {
795 'id': 'PUOgX5z9xZw',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7337,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }, {
807 'info_dict': {
808 'id': 'teuwxikvS5k',
809 'ext': 'mp4',
810 'title': 'teamPGP: Rocket League Noob Stream (zim)',
811 'description': 'md5:dc7872fb300e143831327f1bae3af010',
812 'duration': 7334,
813 'upload_date': '20150721',
814 'uploader': 'Beer Games Beer',
815 'uploader_id': 'beergamesbeer',
816 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
817 'license': 'Standard YouTube License',
818 },
819 }],
820 'params': {
821 'skip_download': True,
822 },
823 'skip': 'This video is not available.',
824 },
825 {
826 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
827 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
828 'info_dict': {
829 'id': 'gVfLd0zydlo',
830 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
831 },
832 'playlist_count': 2,
833 'skip': 'Not multifeed anymore',
834 },
835 {
836 'url': 'https://vid.plus/FlRa-iH7PGw',
837 'only_matching': True,
838 },
839 {
840 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
841 'only_matching': True,
842 },
843 {
844 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
845 # Also tests cut-off URL expansion in video description (see
846 # https://github.com/ytdl-org/youtube-dl/issues/1892,
847 # https://github.com/ytdl-org/youtube-dl/issues/8164)
848 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
849 'info_dict': {
850 'id': 'lsguqyKfVQg',
851 'ext': 'mp4',
852 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
853 'alt_title': 'Dark Walk - Position Music',
854 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
855 'duration': 133,
856 'upload_date': '20151119',
857 'uploader_id': 'IronSoulElf',
858 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
859 'uploader': 'IronSoulElf',
860 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
861 'track': 'Dark Walk - Position Music',
862 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
863 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
864 },
865 'params': {
866 'skip_download': True,
867 },
868 },
869 {
870 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
871 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
872 'only_matching': True,
873 },
874 {
875 # Video with yt:stretch=17:0
876 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
877 'info_dict': {
878 'id': 'Q39EVAstoRM',
879 'ext': 'mp4',
880 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
881 'description': 'md5:ee18a25c350637c8faff806845bddee9',
882 'upload_date': '20151107',
883 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
884 'uploader': 'CH GAMER DROID',
885 },
886 'params': {
887 'skip_download': True,
888 },
889 'skip': 'This video does not exist.',
890 },
891 {
892 # Video licensed under Creative Commons
893 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
894 'info_dict': {
895 'id': 'M4gD1WSo5mA',
896 'ext': 'mp4',
897 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
898 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
899 'duration': 721,
900 'upload_date': '20150127',
901 'uploader_id': 'BerkmanCenter',
902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
903 'uploader': 'The Berkman Klein Center for Internet & Society',
904 'license': 'Creative Commons Attribution license (reuse allowed)',
905 },
906 'params': {
907 'skip_download': True,
908 },
909 },
910 {
911 # Channel-like uploader_url
912 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
913 'info_dict': {
914 'id': 'eQcmzGIKrzg',
915 'ext': 'mp4',
916 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
917 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
918 'duration': 4060,
919 'upload_date': '20151119',
920 'uploader': 'Bernie Sanders',
921 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
922 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
923 'license': 'Creative Commons Attribution license (reuse allowed)',
924 },
925 'params': {
926 'skip_download': True,
927 },
928 },
929 {
930 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
931 'only_matching': True,
932 },
933 {
934 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
935 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
936 'only_matching': True,
937 },
938 {
939 # Rental video preview
940 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
941 'info_dict': {
942 'id': 'uGpuVWrhIzE',
943 'ext': 'mp4',
944 'title': 'Piku - Trailer',
945 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
946 'upload_date': '20150811',
947 'uploader': 'FlixMatrix',
948 'uploader_id': 'FlixMatrixKaravan',
949 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
950 'license': 'Standard YouTube License',
951 },
952 'params': {
953 'skip_download': True,
954 },
955 'skip': 'This video is not available.',
956 },
957 {
958 # YouTube Red video with episode data
959 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
960 'info_dict': {
961 'id': 'iqKdEhx-dD4',
962 'ext': 'mp4',
963 'title': 'Isolation - Mind Field (Ep 1)',
964 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
965 'duration': 2085,
966 'upload_date': '20170118',
967 'uploader': 'Vsauce',
968 'uploader_id': 'Vsauce',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
970 'series': 'Mind Field',
971 'season_number': 1,
972 'episode_number': 1,
973 },
974 'params': {
975 'skip_download': True,
976 },
977 'expected_warnings': [
978 'Skipping DASH manifest',
979 ],
980 },
981 {
982 # The following content has been identified by the YouTube community
983 # as inappropriate or offensive to some audiences.
984 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
985 'info_dict': {
986 'id': '6SJNVb0GnPI',
987 'ext': 'mp4',
988 'title': 'Race Differences in Intelligence',
989 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
990 'duration': 965,
991 'upload_date': '20140124',
992 'uploader': 'New Century Foundation',
993 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
994 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
995 },
996 'params': {
997 'skip_download': True,
998 },
999 },
1000 {
1001 # itag 212
1002 'url': '1t24XAntNCY',
1003 'only_matching': True,
1004 },
1005 {
1006 # geo restricted to JP
1007 'url': 'sJL6WA-aGkQ',
1008 'only_matching': True,
1009 },
1010 {
1011 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1012 'only_matching': True,
1013 },
1014 {
1015 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1016 'only_matching': True,
1017 },
1018 {
1019 # DRM protected
1020 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1021 'only_matching': True,
1022 },
1023 {
1024 # Video with unsupported adaptive stream type formats
1025 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1026 'info_dict': {
1027 'id': 'Z4Vy8R84T1U',
1028 'ext': 'mp4',
1029 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1030 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1031 'duration': 433,
1032 'upload_date': '20130923',
1033 'uploader': 'Amelia Putri Harwita',
1034 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1036 'formats': 'maxcount:10',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 'youtube_include_dash_manifest': False,
1041 },
1042 'skip': 'not actual anymore',
1043 },
1044 {
1045 # Youtube Music Auto-generated description
1046 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1047 'info_dict': {
1048 'id': 'MgNrAu2pzNs',
1049 'ext': 'mp4',
1050 'title': 'Voyeur Girl',
1051 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1052 'upload_date': '20190312',
1053 'uploader': 'Stephen - Topic',
1054 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1055 'artist': 'Stephen',
1056 'track': 'Voyeur Girl',
1057 'album': 'it\'s too much love to know my dear',
1058 'release_date': '20190313',
1059 'release_year': 2019,
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
1066 # Youtube Music Auto-generated description
1067 # Retrieve 'artist' field from 'Artist:' in video description
1068 # when it is present on youtube music video
1069 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1070 'info_dict': {
1071 'id': 'k0jLE7tTwjY',
1072 'ext': 'mp4',
1073 'title': 'Latch Feat. Sam Smith',
1074 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1075 'upload_date': '20150110',
1076 'uploader': 'Various Artists - Topic',
1077 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1078 'artist': 'Disclosure',
1079 'track': 'Latch Feat. Sam Smith',
1080 'album': 'Latch Featuring Sam Smith',
1081 'release_date': '20121008',
1082 'release_year': 2012,
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
1089 # Youtube Music Auto-generated description
1090 # handle multiple artists on youtube music video
1091 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1092 'info_dict': {
1093 'id': '74qn0eJSjpA',
1094 'ext': 'mp4',
1095 'title': 'Eastside',
1096 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1097 'upload_date': '20180710',
1098 'uploader': 'Benny Blanco - Topic',
1099 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1100 'artist': 'benny blanco, Halsey, Khalid',
1101 'track': 'Eastside',
1102 'album': 'Eastside',
1103 'release_date': '20180713',
1104 'release_year': 2018,
1105 },
1106 'params': {
1107 'skip_download': True,
1108 },
1109 },
1110 {
1111 # Youtube Music Auto-generated description
1112 # handle youtube music video with release_year and no release_date
1113 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1114 'info_dict': {
1115 'id': '-hcAI0g-f5M',
1116 'ext': 'mp4',
1117 'title': 'Put It On Me',
1118 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1119 'upload_date': '20180426',
1120 'uploader': 'Matt Maeson - Topic',
1121 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1122 'artist': 'Matt Maeson',
1123 'track': 'Put It On Me',
1124 'album': 'The Hearse',
1125 'release_date': None,
1126 'release_year': 2018,
1127 },
1128 'params': {
1129 'skip_download': True,
1130 },
1131 },
1132 {
1133 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1134 'only_matching': True,
1135 },
1136 {
1137 # invalid -> valid video id redirection
1138 'url': 'DJztXj2GPfl',
1139 'info_dict': {
1140 'id': 'DJztXj2GPfk',
1141 'ext': 'mp4',
1142 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1143 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1144 'upload_date': '20090125',
1145 'uploader': 'Prochorowka',
1146 'uploader_id': 'Prochorowka',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1148 'artist': 'Panjabi MC',
1149 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1150 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1151 },
1152 'params': {
1153 'skip_download': True,
1154 },
1155 },
1156 {
1157 # empty description results in an empty string
1158 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1159 'info_dict': {
1160 'id': 'x41yOUIvK2k',
1161 'ext': 'mp4',
1162 'title': 'IMG 3456',
1163 'description': '',
1164 'upload_date': '20170613',
1165 'uploader_id': 'ElevageOrVert',
1166 'uploader': 'ElevageOrVert',
1167 },
1168 'params': {
1169 'skip_download': True,
1170 },
1171 },
1172 ]
1173
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and set up the per-instance cache of
        signature-deciphering functions used by _decrypt_signature."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature cache id) -> deciphering function
        self._player_cache = {}
1177
1178 def report_video_info_webpage_download(self, video_id):
1179 """Report attempt to download video info webpage."""
1180 self.to_screen('%s: Downloading video info webpage' % video_id)
1181
1182 def report_information_extraction(self, video_id):
1183 """Report attempt to extract video information."""
1184 self.to_screen('%s: Extracting video information' % video_id)
1185
1186 def report_unavailable_format(self, video_id, format):
1187 """Report extracted video URL."""
1188 self.to_screen('%s: Format %s not available' % (video_id, format))
1189
1190 def report_rtmp_download(self):
1191 """Indicate the download will use the RTMP protocol."""
1192 self.to_screen('RTMP download detected')
1193
1194 def _signature_cache_id(self, example_sig):
1195 """ Return a string representation of a signature """
1196 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1197
1198 @classmethod
1199 def _extract_player_info(cls, player_url):
1200 for player_re in cls._PLAYER_INFO_RE:
1201 id_m = re.search(player_re, player_url)
1202 if id_m:
1203 break
1204 else:
1205 raise ExtractorError('Cannot identify player %r' % player_url)
1206 return id_m.group('ext'), id_m.group('id')
1207
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a function deciphering scrambled signatures for this player.

        The deciphered character mapping is cached on disk, keyed by player
        type/id and the shape of *example_sig*, so each player version only
        has to be downloaded and parsed once.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as the cache file name, so it must not contain
        # path separators
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of source indices: the deciphering
            # function is a fixed selection/permutation of input characters.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            # _extract_player_info only yields 'js' or 'swf' extensions
            assert False, 'Invalid player type %r' % player_type

        # Probe the function with a string of distinct characters so the
        # resulting index mapping can be recorded and cached.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1247
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the deciphering function *func*,
        reconstructed by probing it with a string of distinct characters."""
        def gen_sig_code(idxs):
            # Compress the index list into s[...] slice expressions wherever
            # consecutive indices form a run with step +1 or -1.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or flush it as a slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new unit-step run
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index
                    yield 's[%d]' % prev
            # Emit the final element (i holds the last loop value) or close
            # the trailing run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1286
    def _parse_sig_js(self, jscode):
        """Locate the signature-deciphering function in the player JavaScript
        and return a wrapper applying it to a single signature string."""
        # Candidate patterns are tried in order; the more specific current
        # ones come first so the obsolete ones cannot shadow them.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # JSInterpreter functions take their arguments as a list
        return lambda s: initial_function([s])
1307
1308 def _parse_sig_swf(self, file_contents):
1309 swfi = SWFInterpreter(file_contents)
1310 TARGET_CLASSNAME = 'SignatureDecipher'
1311 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1312 initial_function = swfi.extract_function(searched_class, 'decipher')
1313 return lambda s: initial_function([s])
1314
1315 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1316 """Turn the encrypted s field into a working signature"""
1317
1318 if player_url is None:
1319 raise ExtractorError('Cannot decrypt signature without player_url')
1320
1321 if player_url.startswith('//'):
1322 player_url = 'https:' + player_url
1323 elif not re.match(r'https?://', player_url):
1324 player_url = compat_urlparse.urljoin(
1325 'https://www.youtube.com', player_url)
1326 try:
1327 player_id = (player_url, self._signature_cache_id(s))
1328 if player_id not in self._player_cache:
1329 func = self._extract_signature_function(
1330 video_id, player_url, s
1331 )
1332 self._player_cache[player_id] = func
1333 func = self._player_cache[player_id]
1334 if self._downloader.params.get('youtube_print_sig_code'):
1335 self._print_sig_code(func, s)
1336 return func(s)
1337 except Exception as e:
1338 tb = traceback.format_exc()
1339 raise ExtractorError(
1340 'Signature extraction failed: ' + tb, cause=e)
1341
1342 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1343 try:
1344 subs_doc = self._download_xml(
1345 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1346 video_id, note=False)
1347 except ExtractorError as err:
1348 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1349 return {}
1350
1351 sub_lang_list = {}
1352 for track in subs_doc.findall('track'):
1353 lang = track.attrib['lang_code']
1354 if lang in sub_lang_list:
1355 continue
1356 sub_formats = []
1357 for ext in self._SUBTITLE_FORMATS:
1358 params = compat_urllib_parse_urlencode({
1359 'lang': lang,
1360 'v': video_id,
1361 'fmt': ext,
1362 'name': track.attrib['name'].encode('utf-8'),
1363 })
1364 sub_formats.append({
1365 'url': 'https://www.youtube.com/api/timedtext?' + params,
1366 'ext': ext,
1367 })
1368 sub_lang_list[lang] = sub_formats
1369 if has_live_chat_replay:
1370 sub_lang_list['live_chat'] = [
1371 {
1372 'video_id': video_id,
1373 'ext': 'json',
1374 'protocol': 'youtube_live_chat_replay',
1375 },
1376 ]
1377 if not sub_lang_list:
1378 self._downloader.report_warning('video doesn\'t have subtitles')
1379 return {}
1380 return sub_lang_list
1381
1382 def _get_ytplayer_config(self, video_id, webpage):
1383 patterns = (
1384 # User data may contain arbitrary character sequences that may affect
1385 # JSON extraction with regex, e.g. when '};' is contained the second
1386 # regex won't capture the whole JSON. Yet working around by trying more
1387 # concrete regex first keeping in mind proper quoted string handling
1388 # to be implemented in future that will replace this workaround (see
1389 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1390 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1391 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1392 r';ytplayer\.config\s*=\s*({.+?});',
1393 )
1394 config = self._search_regex(
1395 patterns, webpage, 'ytplayer.config', default=None)
1396 if config:
1397 return self._parse_json(
1398 uppercase_escape(config), video_id, fatal=False)
1399
1400 def _get_yt_initial_data(self, video_id, webpage):
1401 config = self._search_regex(
1402 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1403 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
1404 webpage, 'ytInitialData', default=None)
1405 if config:
1406 return self._parse_json(
1407 uppercase_escape(config), video_id, fatal=False)
1408
1409 def _get_music_metadata_from_yt_initial(self, yt_initial):
1410 music_metadata = []
1411 key_map = {
1412 'Album': 'album',
1413 'Artist': 'artist',
1414 'Song': 'track'
1415 }
1416 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1417 if type(contents) is list:
1418 for content in contents:
1419 music_track = {}
1420 if type(content) is not dict:
1421 continue
1422 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1423 if type(videoSecondaryInfoRenderer) is not dict:
1424 continue
1425 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1426 if type(rows) is not list:
1427 continue
1428 for row in rows:
1429 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1430 if type(metadataRowRenderer) is not dict:
1431 continue
1432 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1433 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1434 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1435 if type(key) is not str or type(value) is not str:
1436 continue
1437 if key in key_map:
1438 if key_map[key] in music_track:
1439 # we've started on a new track
1440 music_metadata.append(music_track)
1441 music_track = {}
1442 music_track[key_map[key]] = value
1443 if len(music_track.keys()):
1444 music_metadata.append(music_track)
1445 return music_metadata
1446
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Three historical caption sources are tried in order: the legacy
        'ttsurl' endpoint, the player_response captions renderer (format as
        of 22.06.2017), and the oldest caption_tracks args fields.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy path: a dedicated timedtext service URL
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build a {lang: [format, ...]} dict by rewriting sub_url's
                # query string with each target language and format.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    caption_tracks = renderer['captionTracks']
                    for caption_track in caption_tracks:
                        if 'kind' not in caption_track:
                            # not an automatic transcription
                            continue
                        base_url = caption_track['baseUrl']
                        sub_lang_list = []
                        for lang in renderer['translationLanguages']:
                            lang_code = lang.get('languageCode')
                            if lang_code:
                                sub_lang_list.append(lang_code)
                        # Only the first automatic track is used
                        return make_captions(base_url, sub_lang_list)

                    self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                    return {}
            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1555
1556 def _mark_watched(self, video_id, video_info, player_response):
1557 playback_url = url_or_none(try_get(
1558 player_response,
1559 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1560 video_info, lambda x: x['videostats_playback_base_url'][0]))
1561 if not playback_url:
1562 return
1563 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1564 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1565
1566 # cpn generation algorithm is reverse engineered from base.js.
1567 # In fact it works even with dummy cpn.
1568 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1569 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1570
1571 qs.update({
1572 'ver': ['2'],
1573 'cpn': [cpn],
1574 })
1575 playback_url = compat_urlparse.urlunparse(
1576 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1577
1578 self._download_webpage(
1579 playback_url, video_id, 'Marking watched',
1580 'Unable to mark watched', fatal=False)
1581
    @staticmethod
    def _extract_urls(webpage):
        """Return all embedded YouTube URLs/video ids found in *webpage*."""
        # Embedded YouTube player (iframe/embed/object/SWFObject variants)
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed (stores only the bare video id)
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        # Each match is a tuple of groups; the video id is the last group
        entries.extend(m[-1] for m in matches)

        return entries
1613
1614 @staticmethod
1615 def _extract_url(webpage):
1616 urls = YoutubeIE._extract_urls(webpage)
1617 return urls[0] if urls else None
1618
1619 @classmethod
1620 def extract_id(cls, url):
1621 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1622 if mobj is None:
1623 raise ExtractorError('Invalid URL: %s' % url)
1624 video_id = mobj.group(2)
1625 return video_id
1626
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapters from the ytInitialData JSON in the watch page;
        return a list of chapter dicts, or None when unavailable."""
        if not webpage:
            return
        initial_data = self._parse_json(
            self._search_regex(
                r'window\["ytInitialData"\] = (.+);\n', webpage,
                'player args', default='{}'),
            video_id, fatal=False)
        if not initial_data or not isinstance(initial_data, dict):
            return
        # Chapter data lives deep inside the player overlay renderer tree
        chapters_list = try_get(
            initial_data,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Start times are given in milliseconds; convert to seconds
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one starts; the last chapter
            # ends at the full video duration
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1675
1676 @staticmethod
1677 def _extract_chapters_from_description(description, duration):
1678 if not description:
1679 return None
1680 chapter_lines = re.findall(
1681 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1682 description)
1683 if not chapter_lines:
1684 return None
1685 chapters = []
1686 for next_num, (chapter_line, time_point) in enumerate(
1687 chapter_lines, start=1):
1688 start_time = parse_duration(time_point)
1689 if start_time is None:
1690 continue
1691 if start_time > duration:
1692 break
1693 end_time = (duration if next_num == len(chapter_lines)
1694 else parse_duration(chapter_lines[next_num][1]))
1695 if end_time is None:
1696 continue
1697 if end_time > duration:
1698 end_time = duration
1699 if start_time > end_time:
1700 break
1701 chapter_title = re.sub(
1702 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1703 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1704 chapters.append({
1705 'start_time': start_time,
1706 'end_time': end_time,
1707 'title': chapter_title,
1708 })
1709 return chapters
1710
1711 def _extract_chapters(self, webpage, description, video_id, duration):
1712 return (self._extract_chapters_from_json(webpage, video_id, duration)
1713 or self._extract_chapters_from_description(description, duration))
1714
1715 def _real_extract(self, url):
1716 url, smuggled_data = unsmuggle_url(url, {})
1717
1718 proto = (
1719 'http' if self._downloader.params.get('prefer_insecure', False)
1720 else 'https')
1721
1722 start_time = None
1723 end_time = None
1724 parsed_url = compat_urllib_parse_urlparse(url)
1725 for component in [parsed_url.fragment, parsed_url.query]:
1726 query = compat_parse_qs(component)
1727 if start_time is None and 't' in query:
1728 start_time = parse_duration(query['t'][0])
1729 if start_time is None and 'start' in query:
1730 start_time = parse_duration(query['start'][0])
1731 if end_time is None and 'end' in query:
1732 end_time = parse_duration(query['end'][0])
1733
1734 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1735 mobj = re.search(self._NEXT_URL_RE, url)
1736 if mobj:
1737 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1738 video_id = self.extract_id(url)
1739
1740 # Get video webpage
1741 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1742 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1743
1744 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1745 video_id = qs.get('v', [None])[0] or video_id
1746
1747 # Attempt to extract SWF player URL
1748 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1749 if mobj is not None:
1750 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1751 else:
1752 player_url = None
1753
1754 dash_mpds = []
1755
1756 def add_dash_mpd(video_info):
1757 dash_mpd = video_info.get('dashmpd')
1758 if dash_mpd and dash_mpd[0] not in dash_mpds:
1759 dash_mpds.append(dash_mpd[0])
1760
1761 def add_dash_mpd_pr(pl_response):
1762 dash_mpd = url_or_none(try_get(
1763 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1764 compat_str))
1765 if dash_mpd and dash_mpd not in dash_mpds:
1766 dash_mpds.append(dash_mpd)
1767
1768 is_live = None
1769 view_count = None
1770
1771 def extract_view_count(v_info):
1772 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1773
1774 def extract_player_response(player_response, video_id):
1775 pl_response = str_or_none(player_response)
1776 if not pl_response:
1777 return
1778 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1779 if isinstance(pl_response, dict):
1780 add_dash_mpd_pr(pl_response)
1781 return pl_response
1782
1783 def extract_embedded_config(embed_webpage, video_id):
1784 embedded_config = self._search_regex(
1785 r'setConfig\(({.*})\);',
1786 embed_webpage, 'ytInitialData', default=None)
1787 if embedded_config:
1788 return embedded_config
1789
1790 player_response = {}
1791
1792 # Get video info
1793 video_info = {}
1794 embed_webpage = None
1795 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1796 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1797 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1798 age_gate = True
1799 # We simulate the access to the video from www.youtube.com/v/{video_id}
1800 # this can be viewed without login into Youtube
1801 url = proto + '://www.youtube.com/embed/%s' % video_id
1802 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1803 ext = extract_embedded_config(embed_webpage, video_id)
1804 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1805 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1806 if not playable_in_embed:
1807 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1808 playable_in_embed = ''
1809 else:
1810 playable_in_embed = playable_in_embed.group('playableinEmbed')
1811 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1812 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1813 if playable_in_embed == 'false':
1814 '''
1815 # TODO apply this patch when Support for Python 2.6(!) and above drops
1816 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1817 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1818 '''
1819 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1820 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1821 age_gate = False
1822 # Try looking directly into the video webpage
1823 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1824 if ytplayer_config:
1825 args = ytplayer_config['args']
1826 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1827 # Convert to the same format returned by compat_parse_qs
1828 video_info = dict((k, [v]) for k, v in args.items())
1829 add_dash_mpd(video_info)
1830 # Rental video is not rented but preview is available (e.g.
1831 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1832 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1833 if not video_info and args.get('ypc_vid'):
1834 return self.url_result(
1835 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1836 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1837 is_live = True
1838 if not player_response:
1839 player_response = extract_player_response(args.get('player_response'), video_id)
1840 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1841 add_dash_mpd_pr(player_response)
1842 else:
1843 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1844 else:
1845 data = compat_urllib_parse_urlencode({
1846 'video_id': video_id,
1847 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1848 'sts': self._search_regex(
1849 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1850 })
1851 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1852 try:
1853 video_info_webpage = self._download_webpage(
1854 video_info_url, video_id,
1855 note='Refetching age-gated info webpage',
1856 errnote='unable to download video info webpage')
1857 except ExtractorError:
1858 video_info_webpage = None
1859 if video_info_webpage:
1860 video_info = compat_parse_qs(video_info_webpage)
1861 pl_response = video_info.get('player_response', [None])[0]
1862 player_response = extract_player_response(pl_response, video_id)
1863 add_dash_mpd(video_info)
1864 view_count = extract_view_count(video_info)
1865 else:
1866 age_gate = False
1867 # Try looking directly into the video webpage
1868 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1869 if ytplayer_config:
1870 args = ytplayer_config['args']
1871 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1872 # Convert to the same format returned by compat_parse_qs
1873 video_info = dict((k, [v]) for k, v in args.items())
1874 add_dash_mpd(video_info)
1875 # Rental video is not rented but preview is available (e.g.
1876 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1877 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1878 if not video_info and args.get('ypc_vid'):
1879 return self.url_result(
1880 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1881 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1882 is_live = True
1883 if not player_response:
1884 player_response = extract_player_response(args.get('player_response'), video_id)
1885 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1886 add_dash_mpd_pr(player_response)
1887
1888 def extract_unavailable_message():
1889 messages = []
1890 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1891 msg = self._html_search_regex(
1892 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1893 video_webpage, 'unavailable %s' % kind, default=None)
1894 if msg:
1895 messages.append(msg)
1896 if messages:
1897 return '\n'.join(messages)
1898
1899 if not video_info and not player_response:
1900 unavailable_message = extract_unavailable_message()
1901 if not unavailable_message:
1902 unavailable_message = 'Unable to extract video data'
1903 raise ExtractorError(
1904 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1905
1906 if not isinstance(video_info, dict):
1907 video_info = {}
1908
1909 video_details = try_get(
1910 player_response, lambda x: x['videoDetails'], dict) or {}
1911
1912 microformat = try_get(
1913 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1914
1915 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1916 if not video_title:
1917 self._downloader.report_warning('Unable to extract video title')
1918 video_title = '_'
1919
1920 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1921 if video_description:
1922
1923 def replace_url(m):
1924 redir_url = compat_urlparse.urljoin(url, m.group(1))
1925 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1926 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1927 qs = compat_parse_qs(parsed_redir_url.query)
1928 q = qs.get('q')
1929 if q and q[0]:
1930 return q[0]
1931 return redir_url
1932
1933 description_original = video_description = re.sub(r'''(?x)
1934 <a\s+
1935 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1936 (?:title|href)="([^"]+)"\s+
1937 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1938 class="[^"]*"[^>]*>
1939 [^<]+\.{3}\s*
1940 </a>
1941 ''', replace_url, video_description)
1942 video_description = clean_html(video_description)
1943 else:
1944 video_description = video_details.get('shortDescription')
1945 if video_description is None:
1946 video_description = self._html_search_meta('description', video_webpage)
1947
1948 if not smuggled_data.get('force_singlefeed', False):
1949 if not self._downloader.params.get('noplaylist'):
1950 multifeed_metadata_list = try_get(
1951 player_response,
1952 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1953 compat_str) or try_get(
1954 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1955 if multifeed_metadata_list:
1956 entries = []
1957 feed_ids = []
1958 for feed in multifeed_metadata_list.split(','):
1959 # Unquote should take place before split on comma (,) since textual
1960 # fields may contain comma as well (see
1961 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1962 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1963
1964 def feed_entry(name):
1965 return try_get(feed_data, lambda x: x[name][0], compat_str)
1966
1967 feed_id = feed_entry('id')
1968 if not feed_id:
1969 continue
1970 feed_title = feed_entry('title')
1971 title = video_title
1972 if feed_title:
1973 title += ' (%s)' % feed_title
1974 entries.append({
1975 '_type': 'url_transparent',
1976 'ie_key': 'Youtube',
1977 'url': smuggle_url(
1978 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1979 {'force_singlefeed': True}),
1980 'title': title,
1981 })
1982 feed_ids.append(feed_id)
1983 self.to_screen(
1984 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1985 % (', '.join(feed_ids), video_id))
1986 return self.playlist_result(entries, video_id, video_title, video_description)
1987 else:
1988 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1989
1990 if view_count is None:
1991 view_count = extract_view_count(video_info)
1992 if view_count is None and video_details:
1993 view_count = int_or_none(video_details.get('viewCount'))
1994 if view_count is None and microformat:
1995 view_count = int_or_none(microformat.get('viewCount'))
1996
1997 if is_live is None:
1998 is_live = bool_or_none(video_details.get('isLive'))
1999
2000 has_live_chat_replay = False
2001 if not is_live:
2002 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2003 try:
2004 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2005 has_live_chat_replay = True
2006 except (KeyError, IndexError, TypeError):
2007 pass
2008
2009 # Check for "rental" videos
2010 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2011 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2012
2013 def _extract_filesize(media_url):
2014 return int_or_none(self._search_regex(
2015 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2016
2017 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2018 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2019
2020 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2021 self.report_rtmp_download()
2022 formats = [{
2023 'format_id': '_rtmp',
2024 'protocol': 'rtmp',
2025 'url': video_info['conn'][0],
2026 'player_url': player_url,
2027 }]
2028 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2029 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2030 if 'rtmpe%3Dyes' in encoded_url_map:
2031 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2032 formats = []
2033 formats_spec = {}
2034 fmt_list = video_info.get('fmt_list', [''])[0]
2035 if fmt_list:
2036 for fmt in fmt_list.split(','):
2037 spec = fmt.split('/')
2038 if len(spec) > 1:
2039 width_height = spec[1].split('x')
2040 if len(width_height) == 2:
2041 formats_spec[spec[0]] = {
2042 'resolution': spec[1],
2043 'width': int_or_none(width_height[0]),
2044 'height': int_or_none(width_height[1]),
2045 }
2046 for fmt in streaming_formats:
2047 itag = str_or_none(fmt.get('itag'))
2048 if not itag:
2049 continue
2050 quality = fmt.get('quality')
2051 quality_label = fmt.get('qualityLabel') or quality
2052 formats_spec[itag] = {
2053 'asr': int_or_none(fmt.get('audioSampleRate')),
2054 'filesize': int_or_none(fmt.get('contentLength')),
2055 'format_note': quality_label,
2056 'fps': int_or_none(fmt.get('fps')),
2057 'height': int_or_none(fmt.get('height')),
2058 # bitrate for itag 43 is always 2147483647
2059 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2060 'width': int_or_none(fmt.get('width')),
2061 }
2062
2063 for fmt in streaming_formats:
2064 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2065 continue
2066 url = url_or_none(fmt.get('url'))
2067
2068 if not url:
2069 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2070 if not cipher:
2071 continue
2072 url_data = compat_parse_qs(cipher)
2073 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2074 if not url:
2075 continue
2076 else:
2077 cipher = None
2078 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2079
2080 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2081 # Unsupported FORMAT_STREAM_TYPE_OTF
2082 if stream_type == 3:
2083 continue
2084
2085 format_id = fmt.get('itag') or url_data['itag'][0]
2086 if not format_id:
2087 continue
2088 format_id = compat_str(format_id)
2089
2090 if cipher:
2091 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2092 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
2093 jsplayer_url_json = self._search_regex(
2094 ASSETS_RE,
2095 embed_webpage if age_gate else video_webpage,
2096 'JS player URL (1)', default=None)
2097 if not jsplayer_url_json and not age_gate:
2098 # We need the embed website after all
2099 if embed_webpage is None:
2100 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2101 embed_webpage = self._download_webpage(
2102 embed_url, video_id, 'Downloading embed webpage')
2103 jsplayer_url_json = self._search_regex(
2104 ASSETS_RE, embed_webpage, 'JS player URL')
2105
2106 player_url = json.loads(jsplayer_url_json)
2107 if player_url is None:
2108 player_url_json = self._search_regex(
2109 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2110 video_webpage, 'age gate player URL')
2111 player_url = json.loads(player_url_json)
2112
2113 if 'sig' in url_data:
2114 url += '&signature=' + url_data['sig'][0]
2115 elif 's' in url_data:
2116 encrypted_sig = url_data['s'][0]
2117
2118 if self._downloader.params.get('verbose'):
2119 if player_url is None:
2120 player_desc = 'unknown'
2121 else:
2122 player_type, player_version = self._extract_player_info(player_url)
2123 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2124 parts_sizes = self._signature_cache_id(encrypted_sig)
2125 self.to_screen('{%s} signature length %s, %s' %
2126 (format_id, parts_sizes, player_desc))
2127
2128 signature = self._decrypt_signature(
2129 encrypted_sig, video_id, player_url, age_gate)
2130 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2131 url += '&%s=%s' % (sp, signature)
2132 if 'ratebypass' not in url:
2133 url += '&ratebypass=yes'
2134
2135 dct = {
2136 'format_id': format_id,
2137 'url': url,
2138 'player_url': player_url,
2139 }
2140 if format_id in self._formats:
2141 dct.update(self._formats[format_id])
2142 if format_id in formats_spec:
2143 dct.update(formats_spec[format_id])
2144
2145 # Some itags are not included in DASH manifest thus corresponding formats will
2146 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2147 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2148 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2149 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2150
2151 if width is None:
2152 width = int_or_none(fmt.get('width'))
2153 if height is None:
2154 height = int_or_none(fmt.get('height'))
2155
2156 filesize = int_or_none(url_data.get(
2157 'clen', [None])[0]) or _extract_filesize(url)
2158
2159 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2160 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2161
2162 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2163 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2164 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2165
2166 more_fields = {
2167 'filesize': filesize,
2168 'tbr': tbr,
2169 'width': width,
2170 'height': height,
2171 'fps': fps,
2172 'format_note': quality_label or quality,
2173 }
2174 for key, value in more_fields.items():
2175 if value:
2176 dct[key] = value
2177 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2178 if type_:
2179 type_split = type_.split(';')
2180 kind_ext = type_split[0].split('/')
2181 if len(kind_ext) == 2:
2182 kind, _ = kind_ext
2183 dct['ext'] = mimetype2ext(type_split[0])
2184 if kind in ('audio', 'video'):
2185 codecs = None
2186 for mobj in re.finditer(
2187 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2188 if mobj.group('key') == 'codecs':
2189 codecs = mobj.group('val')
2190 break
2191 if codecs:
2192 dct.update(parse_codecs(codecs))
2193 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2194 dct['downloader_options'] = {
2195 # Youtube throttles chunks >~10M
2196 'http_chunk_size': 10485760,
2197 }
2198 formats.append(dct)
2199 else:
2200 manifest_url = (
2201 url_or_none(try_get(
2202 player_response,
2203 lambda x: x['streamingData']['hlsManifestUrl'],
2204 compat_str))
2205 or url_or_none(try_get(
2206 video_info, lambda x: x['hlsvp'][0], compat_str)))
2207 if manifest_url:
2208 formats = []
2209 m3u8_formats = self._extract_m3u8_formats(
2210 manifest_url, video_id, 'mp4', fatal=False)
2211 for a_format in m3u8_formats:
2212 itag = self._search_regex(
2213 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2214 if itag:
2215 a_format['format_id'] = itag
2216 if itag in self._formats:
2217 dct = self._formats[itag].copy()
2218 dct.update(a_format)
2219 a_format = dct
2220 a_format['player_url'] = player_url
2221 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2222 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2223 if self._downloader.params.get('youtube_include_hls_manifest', True):
2224 formats.append(a_format)
2225 else:
2226 error_message = extract_unavailable_message()
2227 if not error_message:
2228 error_message = clean_html(try_get(
2229 player_response, lambda x: x['playabilityStatus']['reason'],
2230 compat_str))
2231 if not error_message:
2232 error_message = clean_html(
2233 try_get(video_info, lambda x: x['reason'][0], compat_str))
2234 if error_message:
2235 raise ExtractorError(error_message, expected=True)
2236 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2237
2238 # uploader
2239 video_uploader = try_get(
2240 video_info, lambda x: x['author'][0],
2241 compat_str) or str_or_none(video_details.get('author'))
2242 if video_uploader:
2243 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2244 else:
2245 self._downloader.report_warning('unable to extract uploader name')
2246
2247 # uploader_id
2248 video_uploader_id = None
2249 video_uploader_url = None
2250 mobj = re.search(
2251 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2252 video_webpage)
2253 if mobj is not None:
2254 video_uploader_id = mobj.group('uploader_id')
2255 video_uploader_url = mobj.group('uploader_url')
2256 else:
2257 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2258 if owner_profile_url:
2259 video_uploader_id = self._search_regex(
2260 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2261 default=None)
2262 video_uploader_url = owner_profile_url
2263
2264 channel_id = (
2265 str_or_none(video_details.get('channelId'))
2266 or self._html_search_meta(
2267 'channelId', video_webpage, 'channel id', default=None)
2268 or self._search_regex(
2269 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2270 video_webpage, 'channel id', default=None, group='id'))
2271 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2272
2273 thumbnails = []
2274 thumbnails_list = try_get(
2275 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2276 for t in thumbnails_list:
2277 if not isinstance(t, dict):
2278 continue
2279 thumbnail_url = url_or_none(t.get('url'))
2280 if not thumbnail_url:
2281 continue
2282 thumbnails.append({
2283 'url': thumbnail_url,
2284 'width': int_or_none(t.get('width')),
2285 'height': int_or_none(t.get('height')),
2286 })
2287
2288 if not thumbnails:
2289 video_thumbnail = None
2290 # We try first to get a high quality image:
2291 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2292 video_webpage, re.DOTALL)
2293 if m_thumb is not None:
2294 video_thumbnail = m_thumb.group(1)
2295 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2296 if thumbnail_url:
2297 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2298 if video_thumbnail:
2299 thumbnails.append({'url': video_thumbnail})
2300
2301 # upload date
2302 upload_date = self._html_search_meta(
2303 'datePublished', video_webpage, 'upload date', default=None)
2304 if not upload_date:
2305 upload_date = self._search_regex(
2306 [r'(?s)id="eow-date.*?>(.*?)</span>',
2307 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2308 video_webpage, 'upload date', default=None)
2309 if not upload_date:
2310 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2311 upload_date = unified_strdate(upload_date)
2312
2313 video_license = self._html_search_regex(
2314 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2315 video_webpage, 'license', default=None)
2316
2317 m_music = re.search(
2318 r'''(?x)
2319 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2320 <ul[^>]*>\s*
2321 <li>(?P<title>.+?)
2322 by (?P<creator>.+?)
2323 (?:
2324 \(.+?\)|
2325 <a[^>]*
2326 (?:
2327 \bhref=["\']/red[^>]*>| # drop possible
2328 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2329 )
2330 .*?
2331 )?</li
2332 ''',
2333 video_webpage)
2334 if m_music:
2335 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2336 video_creator = clean_html(m_music.group('creator'))
2337 else:
2338 video_alt_title = video_creator = None
2339
2340 def extract_meta(field):
2341 return self._html_search_regex(
2342 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2343 video_webpage, field, default=None)
2344
2345 track = extract_meta('Song')
2346 artist = extract_meta('Artist')
2347 album = extract_meta('Album')
2348
2349 # Youtube Music Auto-generated description
2350 release_date = release_year = None
2351 if video_description:
2352 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2353 if mobj:
2354 if not track:
2355 track = mobj.group('track').strip()
2356 if not artist:
2357 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2358 if not album:
2359 album = mobj.group('album'.strip())
2360 release_year = mobj.group('release_year')
2361 release_date = mobj.group('release_date')
2362 if release_date:
2363 release_date = release_date.replace('-', '')
2364 if not release_year:
2365 release_year = int(release_date[:4])
2366 if release_year:
2367 release_year = int(release_year)
2368
2369 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2370 if yt_initial:
2371 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2372 if len(music_metadata):
2373 album = music_metadata[0].get('album')
2374 artist = music_metadata[0].get('artist')
2375 track = music_metadata[0].get('track')
2376
2377 m_episode = re.search(
2378 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2379 video_webpage)
2380 if m_episode:
2381 series = unescapeHTML(m_episode.group('series'))
2382 season_number = int(m_episode.group('season'))
2383 episode_number = int(m_episode.group('episode'))
2384 else:
2385 series = season_number = episode_number = None
2386
2387 m_cat_container = self._search_regex(
2388 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2389 video_webpage, 'categories', default=None)
2390 category = None
2391 if m_cat_container:
2392 category = self._html_search_regex(
2393 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2394 default=None)
2395 if not category:
2396 category = try_get(
2397 microformat, lambda x: x['category'], compat_str)
2398 video_categories = None if category is None else [category]
2399
2400 video_tags = [
2401 unescapeHTML(m.group('content'))
2402 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2403 if not video_tags:
2404 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2405
2406 def _extract_count(count_name):
2407 return str_to_int(self._search_regex(
2408 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2409 % re.escape(count_name),
2410 video_webpage, count_name, default=None))
2411
2412 like_count = _extract_count('like')
2413 dislike_count = _extract_count('dislike')
2414
2415 if view_count is None:
2416 view_count = str_to_int(self._search_regex(
2417 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2418 'view count', default=None))
2419
2420 average_rating = (
2421 float_or_none(video_details.get('averageRating'))
2422 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2423
2424 # subtitles
2425 video_subtitles = self.extract_subtitles(
2426 video_id, video_webpage, has_live_chat_replay)
2427 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2428
2429 video_duration = try_get(
2430 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2431 if not video_duration:
2432 video_duration = int_or_none(video_details.get('lengthSeconds'))
2433 if not video_duration:
2434 video_duration = parse_duration(self._html_search_meta(
2435 'duration', video_webpage, 'video duration'))
2436
2437 # Get Subscriber Count of channel
2438 subscriber_count = parse_count(self._search_regex(
2439 r'"text":"([\d\.]+\w?) subscribers"',
2440 video_webpage,
2441 'subscriber count',
2442 default=None
2443 ))
2444
2445 # annotations
2446 video_annotations = None
2447 if self._downloader.params.get('writeannotations', False):
2448 xsrf_token = self._search_regex(
2449 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2450 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2451 invideo_url = try_get(
2452 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2453 if xsrf_token and invideo_url:
2454 xsrf_field_name = self._search_regex(
2455 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2456 video_webpage, 'xsrf field name',
2457 group='xsrf_field_name', default='session_token')
2458 video_annotations = self._download_webpage(
2459 self._proto_relative_url(invideo_url),
2460 video_id, note='Downloading annotations',
2461 errnote='Unable to download video annotations', fatal=False,
2462 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2463
2464 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2465
2466 # Look for the DASH manifest
2467 if self._downloader.params.get('youtube_include_dash_manifest', True):
2468 dash_mpd_fatal = True
2469 for mpd_url in dash_mpds:
2470 dash_formats = {}
2471 try:
2472 def decrypt_sig(mobj):
2473 s = mobj.group(1)
2474 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2475 return '/signature/%s' % dec_s
2476
2477 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2478
2479 for df in self._extract_mpd_formats(
2480 mpd_url, video_id, fatal=dash_mpd_fatal,
2481 formats_dict=self._formats):
2482 if not df.get('filesize'):
2483 df['filesize'] = _extract_filesize(df['url'])
2484 # Do not overwrite DASH format found in some previous DASH manifest
2485 if df['format_id'] not in dash_formats:
2486 dash_formats[df['format_id']] = df
2487 # Additional DASH manifests may end up in HTTP Error 403 therefore
2488 # allow them to fail without bug report message if we already have
2489 # some DASH manifest succeeded. This is temporary workaround to reduce
2490 # burst of bug reports until we figure out the reason and whether it
2491 # can be fixed at all.
2492 dash_mpd_fatal = False
2493 except (ExtractorError, KeyError) as e:
2494 self.report_warning(
2495 'Skipping DASH manifest: %r' % e, video_id)
2496 if dash_formats:
2497 # Remove the formats we found through non-DASH, they
2498 # contain less info and it can be wrong, because we use
2499 # fixed values (for example the resolution). See
2500 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2501 # example.
2502 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2503 formats.extend(dash_formats.values())
2504
2505 # Check for malformed aspect ratio
2506 stretched_m = re.search(
2507 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2508 video_webpage)
2509 if stretched_m:
2510 w = float(stretched_m.group('w'))
2511 h = float(stretched_m.group('h'))
2512 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2513 # We will only process correct ratios.
2514 if w > 0 and h > 0:
2515 ratio = w / h
2516 for f in formats:
2517 if f.get('vcodec') != 'none':
2518 f['stretched_ratio'] = ratio
2519
2520 if not formats:
2521 if 'reason' in video_info:
2522 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2523 regions_allowed = self._html_search_meta(
2524 'regionsAllowed', video_webpage, default=None)
2525 countries = regions_allowed.split(',') if regions_allowed else None
2526 self.raise_geo_restricted(
2527 msg=video_info['reason'][0], countries=countries)
2528 reason = video_info['reason'][0]
2529 if 'Invalid parameters' in reason:
2530 unavailable_message = extract_unavailable_message()
2531 if unavailable_message:
2532 reason = unavailable_message
2533 raise ExtractorError(
2534 'YouTube said: %s' % reason,
2535 expected=True, video_id=video_id)
2536 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2537 raise ExtractorError('This video is DRM protected.', expected=True)
2538
2539 self._sort_formats(formats)
2540
2541 self.mark_watched(video_id, video_info, player_response)
2542
2543 return {
2544 'id': video_id,
2545 'uploader': video_uploader,
2546 'uploader_id': video_uploader_id,
2547 'uploader_url': video_uploader_url,
2548 'channel_id': channel_id,
2549 'channel_url': channel_url,
2550 'upload_date': upload_date,
2551 'license': video_license,
2552 'creator': video_creator or artist,
2553 'title': video_title,
2554 'alt_title': video_alt_title or track,
2555 'thumbnails': thumbnails,
2556 'description': video_description,
2557 'categories': video_categories,
2558 'tags': video_tags,
2559 'subtitles': video_subtitles,
2560 'automatic_captions': automatic_captions,
2561 'duration': video_duration,
2562 'age_limit': 18 if age_gate else 0,
2563 'annotations': video_annotations,
2564 'chapters': chapters,
2565 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2566 'view_count': view_count,
2567 'like_count': like_count,
2568 'dislike_count': dislike_count,
2569 'average_rating': average_rating,
2570 'formats': formats,
2571 'is_live': is_live,
2572 'start_time': start_time,
2573 'end_time': end_time,
2574 'series': series,
2575 'season_number': season_number,
2576 'episode_number': episode_number,
2577 'track': track,
2578 'artist': artist,
2579 'album': album,
2580 'release_date': release_date,
2581 'release_year': release_year,
2582 'subscriber_count': subscriber_count,
2583 }
2584
2585
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Matches playlist/watch/embed URLs carrying a list/p/a parameter, bare
    # youtu.be short links with a list parameter, or a bare playlist id.
    # Group 1 captures ids from full URLs, group 2 bare playlist ids.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Log in up-front (if credentials are configured) so private
        # playlists become reachable.
        self._login()

    def extract_videos_from_page(self, page):
        """Collect (video_id, title) pairs from a playlist webpage.

        Tries the modern data-video-id attribute markup first, then falls
        back to progressively more relaxed regexes for older page layouts.
        Returns an iterator of (id, title) tuples; title may be None.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist.

        The mixes are generated from a single video: the playlist id is
        just 'RD' (or similar prefix) + video_id.  Watch pages are fetched
        repeatedly, seeding each request with the last id seen, until no
        new video ids appear.
        """
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Title lives in different elements depending on page variant;
        # `webpage` is the last page fetched in the loop above.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page.

        Returns (has_videos, playlist_result) where has_videos is False
        when the URL does not actually serve a playlist.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """If url points at a specific video, honor --no-playlist.

        Returns (video_id, result): result is a url_result to the single
        video when --no-playlist is set, otherwise None; video_id is None
        when the URL carries no video id at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Dispatch to single-video, mix, or regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2947
2948
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Step aside for the more specific playlists/live extractors.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # `url` is unused here but kept in the signature because
        # YoutubeUserIE's override needs it.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel's videos.

        Prefers redirecting to the channel's uploads playlist (UU...);
        otherwise falls back to paging through the /videos listing.
        """
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The uploads playlist id is the channel id with UC -> UU.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # Empty channel: surface YouTube's alert message if present.
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3048
3049
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Our URL pattern is very permissive, so step aside whenever any
        # other Youtube* extractor defined in this module claims the URL.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Preserve the path kind from the URL (user/c); bare
        # youtube.com/<name> and ytuser: forms default to 'user'.
        mobj = re.match(self._VALID_URL, url)
        path_kind = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, mobj.group('id'))
3107
3108
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Resolve the /live page to the concrete stream video when one is
        # announced in the page metadata; otherwise hand the base channel
        # URL back to the generic channel/user extractors.
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        is_video = page_type.startswith('video')
        if is_video and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3159
3160
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    # All extraction logic is inherited from YoutubePlaylistsBaseInfoExtractor;
    # this class only contributes the URL pattern, name and tests.
    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3193
3194
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Regex shared by the search extractors for pulling video ids (and
    # optional titles) out of search result pages.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3197
3198
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional base64 'params' blob appended to the API payload (set by
    # subclasses, e.g. YoutubeSearchDateIE, to change result ordering).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n search results for query as url_transparent dicts.

        Pages through the InnerTube /youtubei/v1/search endpoint, feeding
        each response's continuation token back into the next request until
        n results are produced or no further page is available.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first page and continuation pages nest the result list
            # under different keys; try both shapes.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                # Skip non-video entries (channels, playlists, ads, shelves).
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # Continuation token for the next page, if any.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3287
3288
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Search 'params' blob selecting newest-first ordering (per IE_DESC);
    # URL-encoded base64 ('CAI=').
    _SEARCH_PARAMS = 'CAI%3D'
3294
3295
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Depth-first walk of parsed ytInitialData, returning every dict
        that carries a 'videoId' key (i.e. video renderer objects)."""
        videos = []

        def _real_find(obj):
            # Strings are leaves.  Use compat_str (not str) so that on
            # Python 2 the unicode strings produced by JSON parsing are
            # short-circuited here too, consistent with the rest of this
            # module; on Python 3 compat_str is str, so this is unchanged.
            if obj is None or isinstance(obj, compat_str):
                return

            if type(obj) is list:
                for elem in obj:
                    _real_find(elem)

            if type(obj) is dict:
                if "videoId" in obj:
                    videos.append(obj)
                    return

                # Recurse into values; keys are irrelevant here.
                for _, o in obj.items():
                    _real_find(o)

        _real_find(extracted)

        return videos

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Parse ytInitialData out of `page` and append video ids/titles
        to the given lists in place, de-duplicating by video id."""
        search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)

        result_items = self._find_videos_in_json(search_response)

        for renderer in result_items:
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText'])

            if video_id is None or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()

            try:
                idx = ids_in_page.index(video_id)
                # Duplicate id: keep the first entry but backfill a missing title.
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs found in `page`."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
3369
3370
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for youtube.com/show pages (multi-season shows)."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is handled as the collection of its playlists, so defer
        # to the playlists base extractor pointed at the /playlists tab.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3388
3389
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # ytInitialData is embedded either as a plain assignment or via
    # window["ytInitialData"]
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _find_videos_in_json(self, extracted):
        """Recursively walk the parsed feed JSON and return a tuple of
        (video dicts, continuation data): every dict carrying a 'videoId'
        key is collected, and the last 'nextContinuationData' object seen
        is kept for pagination (None if there is no further page).
        """
        videos = []
        c = {}

        def _real_find(obj):
            # Skip None and text. On Python 2 JSON strings are unicode
            # (compat_str), not str, so both text types must be tested or
            # the guard silently fails there.
            if obj is None or isinstance(obj, (compat_str, str)):
                return

            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)

            if isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return

                if 'nextContinuationData' in obj:
                    c['continuation'] = obj['nextContinuationData']
                    return

                for o in obj.values():
                    _real_find(o)

        _real_find(extracted)

        return videos, c.get('continuation')

    def _entries(self, page):
        """Yield url_result entries for every video in the feed, following
        browse_ajax continuations until no new videos appear.
        """
        # Track already-yielded ids in a set: the previous linear scan over
        # all earlier videos made deduplication O(n^2) over the whole feed.
        seen_ids = set()

        # ytcfg carries the client identification headers required by the
        # continuation endpoint; without it pagination is not attempted.
        yt_conf = self._parse_json(self._search_regex(
            self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)

        search_response = self._parse_json(
            self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_info = []
            for v in video_info:
                v_id = try_get(v, lambda x: x['videoId'])
                if not v_id or v_id in seen_ids:
                    continue
                seen_ids.add(v_id)
                new_info.append(v)

            # A page with no unseen videos means the feed is exhausted
            if not new_info:
                break

            for video in new_info:
                video_id = try_get(video, lambda x: x['videoId'])
                # title is delivered either as rich-text runs or simpleText
                video_title = (
                    try_get(video, lambda x: x['title']['runs'][0]['text'])
                    or try_get(video, lambda x: x['title']['simpleText']))
                yield self.url_result(video_id, YoutubeIE.ie_key(), video_title=video_title)

            if not continuation or not yt_conf:
                break

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query={
                    "ctoken": try_get(continuation, lambda x: x["continuation"]),
                    "continuation": try_get(continuation, lambda x: x["continuation"]),
                    "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
                },
                headers={
                    "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
                    "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
                    "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
                    "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
                    "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
                    "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
                    "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
                })

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3496
3497
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's Watch Later ('WL') list."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch?v=...&list=WL URL may resolve to just the single video
        _, maybe_video = self._check_download_just_video(url, 'WL')
        if maybe_video:
            return maybe_video
        # Otherwise extract the whole Watch Later playlist
        return self._extract_playlist('WL')[1]
3517
3518
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The my_favorites page links to the real playlist; scrape its id
        # and hand off to the playlist extractor.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
3529
3530
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommendations."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
3536
3537
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
3543
3544
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's watch history."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
3550
3551
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch-all for watch URLs whose video id was eaten by the shell
    (e.g. an unquoted '&'); always raises a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # There is nothing to download here; the URL matched only because
        # the v= parameter is missing, so tell the user what went wrong.
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
3599
3600
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch-all for watch URLs whose 11-character video id was cut short;
    always raises a helpful error."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # An id shorter than 11 characters can never be valid, so report
        # the truncation instead of attempting a download.
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)