# Source: jfr.im git mirror — yt-dlp.git/blob — youtube_dlc/extractor/youtube.py
# merge youtube-dl master 22.09.2020
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_duration,
43 remove_quotes,
44 remove_start,
45 smuggle_url,
46 str_or_none,
47 str_to_int,
48 try_get,
49 unescapeHTML,
50 unified_strdate,
51 unsmuggle_url,
52 uppercase_escape,
53 url_or_none,
54 urlencode_postdata,
55 )
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Internal Google sign-in RPC endpoints (account lookup, password
    # challenge, and two-factor challenge; _TFA_URL is formatted with a
    # "TL" token taken from the challenge response)
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches every known playlist-id prefix (regular, liked, uploads, mixes,
    # auto-generated albums, ...)
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers identifying the request as coming from the (old) web client;
    # sent with JSON continuation requests
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie to force the English ('hl=en') YouTube UI."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap a list of video ids into url_result dicts handled by YoutubeIE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # Hidden form inputs carry session tokens that must be echoed back
        # on every subsequent sign-in RPC
        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one Google sign-in RPC. f_req is an opaque positional
            # JSON structure; the response is JSON behind an anti-XSSI
            # prefix which transform_source strips (everything before the
            # first '[').
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Step 1: look up the account by username. The nested positional
        # layout mirrors what the web sign-in page sends; do not reorder.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # Opaque per-account token required by the password challenge
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Step 2: submit the password for the looked-up account
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A non-empty entry at [0][5] signals a login error (e.g. bad password)
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # Step 3 (optional): handle a follow-up challenge (TFA, reauth, ...)
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token identifying this challenge session
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users sometimes paste codes with the SMS "G-" prefix
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Non-TFA challenges cannot be solved automatically; tell
                # the user to resolve them in a browser
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Step 4: visit the CheckCookie URL to finalize the session cookies
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through myaccount.google.com
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Force the legacy (non-polymer) page layout on every request,
        # which the regex-based extractors in this module expect
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Runs once before extraction: set the language cookie and attempt
        # login when credentials were supplied
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
285
286
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following "Load more" AJAX continuations.

        Subclasses supply _process_page() to parse one chunk of HTML.
        Pagination follows the data-uix-load-more-href link found in the
        widget HTML until it disappears or an empty chunk is returned.
        """
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                    break
                except ExtractorError as e:
                    # Only 500/503 are retried; anything else (or exhausted
                    # retries) propagates
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']
325
326
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a url_result for every (id, title) pair found in *content*."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, appending ids/titles in place.

        Duplicate ids are not re-added; instead, a later match may fill in
        a title that an earlier match lacked.
        """
        for match in re.finditer(video_re, page):
            groups = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in groups and match.group('id') == '0':
                continue
            video_id = match.group('id')
            if 'title' in groups:
                video_title = unescapeHTML(match.group('title'))
            else:
                video_title = None
            if video_title:
                video_title = video_title.strip()
            # The synthetic "Play all" link is not a real title
            if video_title == '► Play all':
                video_title = None
            if video_id in ids_in_page:
                pos = ids_in_page.index(video_id)
                if video_title and not titles_in_page[pos]:
                    titles_in_page[pos] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs parsed via self._VIDEO_RE."""
        collected_ids, collected_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, collected_ids, collected_titles)
        return zip(collected_ids, collected_titles)
358
359
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a playlist url_result for every playlist linked from *content*."""
        playlist_link_re = r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"'
        for list_id in orderedSet(re.findall(playlist_link_re, content)):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % list_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the playlists page and return a playlist_result over its entries."""
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)
        page_title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, list_id), list_id, page_title)
373
374
375 class YoutubeIE(YoutubeBaseInfoExtractor):
376 IE_DESC = 'YouTube.com'
377 _VALID_URL = r"""(?x)^
378 (
379 (?:https?://|//) # http(s):// or protocol-independent URL
380 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
381 (?:www\.)?deturl\.com/www\.youtube\.com/|
382 (?:www\.)?pwnyoutube\.com/|
383 (?:www\.)?hooktube\.com/|
384 (?:www\.)?yourepeat\.com/|
385 tube\.majestyc\.net/|
386 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
387 (?:(?:www|dev)\.)?invidio\.us/|
388 (?:(?:www|no)\.)?invidiou\.sh/|
389 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
390 (?:www\.)?invidious\.kabi\.tk/|
391 (?:www\.)?invidious\.13ad\.de/|
392 (?:www\.)?invidious\.mastodon\.host/|
393 (?:www\.)?invidious\.nixnet\.xyz/|
394 (?:www\.)?invidious\.drycat\.fr/|
395 (?:www\.)?tube\.poal\.co/|
396 (?:www\.)?vid\.wxzm\.sx/|
397 (?:www\.)?yewtu\.be/|
398 (?:www\.)?yt\.elukerio\.org/|
399 (?:www\.)?yt\.lelux\.fi/|
400 (?:www\.)?invidious\.ggc-project\.de/|
401 (?:www\.)?yt\.maisputain\.ovh/|
402 (?:www\.)?invidious\.13ad\.de/|
403 (?:www\.)?invidious\.toot\.koeln/|
404 (?:www\.)?invidious\.fdn\.fr/|
405 (?:www\.)?watch\.nettohikari\.com/|
406 (?:www\.)?kgg2m7yk5aybusll\.onion/|
407 (?:www\.)?qklhadlycap4cnod\.onion/|
408 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
409 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
410 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
411 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
412 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
413 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
414 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
415 (?:.*?\#/)? # handle anchor (#/) redirect urls
416 (?: # the various things that can precede the ID:
417 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
418 |(?: # or the v= param in all its forms
419 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
420 (?:\?|\#!?) # the params delimiter ? or # or #!
421 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
422 v=
423 )
424 ))
425 |(?:
426 youtu\.be| # just youtu.be/xxxx
427 vid\.plus| # or vid.plus/xxxx
428 zwearz\.com/watch| # or zwearz.com/watch/xxxx
429 )/
430 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
431 )
432 )? # all until now is optional -> you can pass the naked ID
433 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
434 (?!.*?\blist=
435 (?:
436 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
437 WL # WL are handled by the watch later IE
438 )
439 )
440 (?(1).+)? # if we found the ID, everything can follow
441 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
442 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
443 _PLAYER_INFO_RE = (
444 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
445 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
446 )
447 _formats = {
448 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
449 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
450 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
451 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
452 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
453 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
454 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
455 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
456 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
457 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
458 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
459 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
460 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
461 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
462 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
463 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
464 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
465 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
466
467
468 # 3D videos
469 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
470 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
471 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
472 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
473 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
474 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
475 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
476
477 # Apple HTTP Live Streaming
478 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
479 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
480 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
481 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
482 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
483 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
484 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
485 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
486
487 # DASH mp4 video
488 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
489 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
490 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
491 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
494 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
497 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
498 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
499 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
500
501 # Dash mp4 audio
502 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
503 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
504 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
505 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
506 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
507 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
508 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
509
510 # Dash webm
511 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
512 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
513 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
514 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
518 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
519 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
520 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
521 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
527 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
528 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
529 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
530 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
531 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
532 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
533
534 # Dash webm audio
535 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
536 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
537
538 # Dash webm audio with opus inside
539 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
540 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
541 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
542
543 # RTMP (unnamed)
544 '_rtmp': {'protocol': 'rtmp'},
545
546 # av01 video only formats sometimes served with "unknown" codecs
547 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
548 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
549 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
550 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 }
552 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
553
554 _GEO_BYPASS = False
555
556 IE_NAME = 'youtube'
557 _TESTS = [
558 {
559 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
560 'info_dict': {
561 'id': 'BaW_jenozKc',
562 'ext': 'mp4',
563 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
564 'uploader': 'Philipp Hagemeister',
565 'uploader_id': 'phihag',
566 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
567 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
568 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
569 'upload_date': '20121002',
570 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
571 'categories': ['Science & Technology'],
572 'tags': ['youtube-dl'],
573 'duration': 10,
574 'view_count': int,
575 'like_count': int,
576 'dislike_count': int,
577 'start_time': 1,
578 'end_time': 9,
579 }
580 },
581 {
582 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
583 'note': 'Test generic use_cipher_signature video (#897)',
584 'info_dict': {
585 'id': 'UxxajLWwzqY',
586 'ext': 'mp4',
587 'upload_date': '20120506',
588 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
589 'alt_title': 'I Love It (feat. Charli XCX)',
590 'description': 'md5:19a2f98d9032b9311e686ed039564f63',
591 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
592 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
593 'iconic ep', 'iconic', 'love', 'it'],
594 'duration': 180,
595 'uploader': 'Icona Pop',
596 'uploader_id': 'IconaPop',
597 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
598 'creator': 'Icona Pop',
599 'track': 'I Love It (feat. Charli XCX)',
600 'artist': 'Icona Pop',
601 }
602 },
603 {
604 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
605 'note': 'Test VEVO video with age protection (#956)',
606 'info_dict': {
607 'id': '07FYdnEawAQ',
608 'ext': 'mp4',
609 'upload_date': '20130703',
610 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
611 'alt_title': 'Tunnel Vision',
612 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
613 'duration': 419,
614 'uploader': 'justintimberlakeVEVO',
615 'uploader_id': 'justintimberlakeVEVO',
616 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
617 'creator': 'Justin Timberlake',
618 'track': 'Tunnel Vision',
619 'artist': 'Justin Timberlake',
620 'age_limit': 18,
621 }
622 },
623 {
624 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
625 'note': 'Embed-only video (#1746)',
626 'info_dict': {
627 'id': 'yZIXLfi8CZQ',
628 'ext': 'mp4',
629 'upload_date': '20120608',
630 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
631 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
632 'uploader': 'SET India',
633 'uploader_id': 'setindia',
634 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
635 'age_limit': 18,
636 }
637 },
638 {
639 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
640 'note': 'Use the first video ID in the URL',
641 'info_dict': {
642 'id': 'BaW_jenozKc',
643 'ext': 'mp4',
644 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
645 'uploader': 'Philipp Hagemeister',
646 'uploader_id': 'phihag',
647 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
648 'upload_date': '20121002',
649 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
650 'categories': ['Science & Technology'],
651 'tags': ['youtube-dl'],
652 'duration': 10,
653 'view_count': int,
654 'like_count': int,
655 'dislike_count': int,
656 },
657 'params': {
658 'skip_download': True,
659 },
660 },
661 {
662 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
663 'note': '256k DASH audio (format 141) via DASH manifest',
664 'info_dict': {
665 'id': 'a9LDPn-MO4I',
666 'ext': 'm4a',
667 'upload_date': '20121002',
668 'uploader_id': '8KVIDEO',
669 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
670 'description': '',
671 'uploader': '8KVIDEO',
672 'title': 'UHDTV TEST 8K VIDEO.mp4'
673 },
674 'params': {
675 'youtube_include_dash_manifest': True,
676 'format': '141',
677 },
678 'skip': 'format 141 not served anymore',
679 },
680 # DASH manifest with encrypted signature
681 {
682 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
683 'info_dict': {
684 'id': 'IB3lcPjvWLA',
685 'ext': 'm4a',
686 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
687 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
688 'duration': 244,
689 'uploader': 'AfrojackVEVO',
690 'uploader_id': 'AfrojackVEVO',
691 'upload_date': '20131011',
692 },
693 'params': {
694 'youtube_include_dash_manifest': True,
695 'format': '141/bestaudio[ext=m4a]',
696 },
697 },
698 # JS player signature function name containing $
699 {
700 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
701 'info_dict': {
702 'id': 'nfWlot6h_JM',
703 'ext': 'm4a',
704 'title': 'Taylor Swift - Shake It Off',
705 'description': 'md5:307195cd21ff7fa352270fe884570ef0',
706 'duration': 242,
707 'uploader': 'TaylorSwiftVEVO',
708 'uploader_id': 'TaylorSwiftVEVO',
709 'upload_date': '20140818',
710 },
711 'params': {
712 'youtube_include_dash_manifest': True,
713 'format': '141/bestaudio[ext=m4a]',
714 },
715 },
716 # Controversy video
717 {
718 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
719 'info_dict': {
720 'id': 'T4XJQO3qol8',
721 'ext': 'mp4',
722 'duration': 219,
723 'upload_date': '20100909',
724 'uploader': 'Amazing Atheist',
725 'uploader_id': 'TheAmazingAtheist',
726 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
727 'title': 'Burning Everyone\'s Koran',
728 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
729 }
730 },
731 # Normal age-gate video (No vevo, embed allowed)
732 {
733 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
734 'info_dict': {
735 'id': 'HtVdAasjOgU',
736 'ext': 'mp4',
737 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
738 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
739 'duration': 142,
740 'uploader': 'The Witcher',
741 'uploader_id': 'WitcherGame',
742 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
743 'upload_date': '20140605',
744 'age_limit': 18,
745 },
746 },
747 # Age-gate video with encrypted signature
748 {
749 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
750 'info_dict': {
751 'id': '6kLq3WMV1nU',
752 'ext': 'mp4',
753 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
754 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
755 'duration': 246,
756 'uploader': 'LloydVEVO',
757 'uploader_id': 'LloydVEVO',
758 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
759 'upload_date': '20110629',
760 'age_limit': 18,
761 },
762 },
763 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
764 # YouTube Red ad is not captured for creator
765 {
766 'url': '__2ABJjxzNo',
767 'info_dict': {
768 'id': '__2ABJjxzNo',
769 'ext': 'mp4',
770 'duration': 266,
771 'upload_date': '20100430',
772 'uploader_id': 'deadmau5',
773 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
774 'creator': 'Dada Life, deadmau5',
775 'description': 'md5:12c56784b8032162bb936a5f76d55360',
776 'uploader': 'deadmau5',
777 'title': 'Deadmau5 - Some Chords (HD)',
778 'alt_title': 'This Machine Kills Some Chords',
779 },
780 'expected_warnings': [
781 'DASH manifest missing',
782 ]
783 },
784 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
785 {
786 'url': 'lqQg6PlCWgI',
787 'info_dict': {
788 'id': 'lqQg6PlCWgI',
789 'ext': 'mp4',
790 'duration': 6085,
791 'upload_date': '20150827',
792 'uploader_id': 'olympic',
793 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
794 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
795 'uploader': 'Olympic',
796 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
797 },
798 'params': {
799 'skip_download': 'requires avconv',
800 }
801 },
802 # Non-square pixels
803 {
804 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
805 'info_dict': {
806 'id': '_b-2C3KPAM0',
807 'ext': 'mp4',
808 'stretched_ratio': 16 / 9.,
809 'duration': 85,
810 'upload_date': '20110310',
811 'uploader_id': 'AllenMeow',
812 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
813 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
814 'uploader': '孫ᄋᄅ',
815 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
816 },
817 },
818 # url_encoded_fmt_stream_map is empty string
819 {
820 'url': 'qEJwOuvDf7I',
821 'info_dict': {
822 'id': 'qEJwOuvDf7I',
823 'ext': 'webm',
824 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
825 'description': '',
826 'upload_date': '20150404',
827 'uploader_id': 'spbelect',
828 'uploader': 'Наблюдатели Петербурга',
829 },
830 'params': {
831 'skip_download': 'requires avconv',
832 },
833 'skip': 'This live event has ended.',
834 },
835 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
836 {
837 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
838 'info_dict': {
839 'id': 'FIl7x6_3R5Y',
840 'ext': 'webm',
841 'title': 'md5:7b81415841e02ecd4313668cde88737a',
842 'description': 'md5:116377fd2963b81ec4ce64b542173306',
843 'duration': 220,
844 'upload_date': '20150625',
845 'uploader_id': 'dorappi2000',
846 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
847 'uploader': 'dorappi2000',
848 'formats': 'mincount:31',
849 },
850 'skip': 'not actual anymore',
851 },
852 # DASH manifest with segment_list
853 {
854 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
855 'md5': '8ce563a1d667b599d21064e982ab9e31',
856 'info_dict': {
857 'id': 'CsmdDsKjzN8',
858 'ext': 'mp4',
859 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
860 'uploader': 'Airtek',
861 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
862 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
863 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
864 },
865 'params': {
866 'youtube_include_dash_manifest': True,
867 'format': '135', # bestvideo
868 },
869 'skip': 'This live event has ended.',
870 },
871 {
872 # Multifeed videos (multiple cameras), URL is for Main Camera
873 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
874 'info_dict': {
875 'id': 'jqWvoWXjCVs',
876 'title': 'teamPGP: Rocket League Noob Stream',
877 'description': 'md5:dc7872fb300e143831327f1bae3af010',
878 },
879 'playlist': [{
880 'info_dict': {
881 'id': 'jqWvoWXjCVs',
882 'ext': 'mp4',
883 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
884 'description': 'md5:dc7872fb300e143831327f1bae3af010',
885 'duration': 7335,
886 'upload_date': '20150721',
887 'uploader': 'Beer Games Beer',
888 'uploader_id': 'beergamesbeer',
889 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
890 'license': 'Standard YouTube License',
891 },
892 }, {
893 'info_dict': {
894 'id': '6h8e8xoXJzg',
895 'ext': 'mp4',
896 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
897 'description': 'md5:dc7872fb300e143831327f1bae3af010',
898 'duration': 7337,
899 'upload_date': '20150721',
900 'uploader': 'Beer Games Beer',
901 'uploader_id': 'beergamesbeer',
902 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
903 'license': 'Standard YouTube License',
904 },
905 }, {
906 'info_dict': {
907 'id': 'PUOgX5z9xZw',
908 'ext': 'mp4',
909 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
910 'description': 'md5:dc7872fb300e143831327f1bae3af010',
911 'duration': 7337,
912 'upload_date': '20150721',
913 'uploader': 'Beer Games Beer',
914 'uploader_id': 'beergamesbeer',
915 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
916 'license': 'Standard YouTube License',
917 },
918 }, {
919 'info_dict': {
920 'id': 'teuwxikvS5k',
921 'ext': 'mp4',
922 'title': 'teamPGP: Rocket League Noob Stream (zim)',
923 'description': 'md5:dc7872fb300e143831327f1bae3af010',
924 'duration': 7334,
925 'upload_date': '20150721',
926 'uploader': 'Beer Games Beer',
927 'uploader_id': 'beergamesbeer',
928 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
929 'license': 'Standard YouTube License',
930 },
931 }],
932 'params': {
933 'skip_download': True,
934 },
935 'skip': 'This video is not available.',
936 },
937 {
938 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
939 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
940 'info_dict': {
941 'id': 'gVfLd0zydlo',
942 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
943 },
944 'playlist_count': 2,
945 'skip': 'Not multifeed anymore',
946 },
947 {
948 'url': 'https://vid.plus/FlRa-iH7PGw',
949 'only_matching': True,
950 },
951 {
952 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
953 'only_matching': True,
954 },
955 {
956 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
957 # Also tests cut-off URL expansion in video description (see
958 # https://github.com/ytdl-org/youtube-dl/issues/1892,
959 # https://github.com/ytdl-org/youtube-dl/issues/8164)
960 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
961 'info_dict': {
962 'id': 'lsguqyKfVQg',
963 'ext': 'mp4',
964 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
965 'alt_title': 'Dark Walk - Position Music',
966 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
967 'duration': 133,
968 'upload_date': '20151119',
969 'uploader_id': 'IronSoulElf',
970 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
971 'uploader': 'IronSoulElf',
972 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
973 'track': 'Dark Walk - Position Music',
974 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
975 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
976 },
977 'params': {
978 'skip_download': True,
979 },
980 },
981 {
982 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
983 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
984 'only_matching': True,
985 },
986 {
987 # Video with yt:stretch=17:0
988 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
989 'info_dict': {
990 'id': 'Q39EVAstoRM',
991 'ext': 'mp4',
992 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
993 'description': 'md5:ee18a25c350637c8faff806845bddee9',
994 'upload_date': '20151107',
995 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
996 'uploader': 'CH GAMER DROID',
997 },
998 'params': {
999 'skip_download': True,
1000 },
1001 'skip': 'This video does not exist.',
1002 },
1003 {
1004 # Video licensed under Creative Commons
1005 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1006 'info_dict': {
1007 'id': 'M4gD1WSo5mA',
1008 'ext': 'mp4',
1009 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1010 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1011 'duration': 721,
1012 'upload_date': '20150127',
1013 'uploader_id': 'BerkmanCenter',
1014 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1015 'uploader': 'The Berkman Klein Center for Internet & Society',
1016 'license': 'Creative Commons Attribution license (reuse allowed)',
1017 },
1018 'params': {
1019 'skip_download': True,
1020 },
1021 },
1022 {
1023 # Channel-like uploader_url
1024 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1025 'info_dict': {
1026 'id': 'eQcmzGIKrzg',
1027 'ext': 'mp4',
1028 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1029 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
1030 'duration': 4060,
1031 'upload_date': '20151119',
1032 'uploader': 'Bernie Sanders',
1033 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1034 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1035 'license': 'Creative Commons Attribution license (reuse allowed)',
1036 },
1037 'params': {
1038 'skip_download': True,
1039 },
1040 },
1041 {
1042 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1043 'only_matching': True,
1044 },
1045 {
1046 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1047 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1048 'only_matching': True,
1049 },
1050 {
1051 # Rental video preview
1052 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1053 'info_dict': {
1054 'id': 'uGpuVWrhIzE',
1055 'ext': 'mp4',
1056 'title': 'Piku - Trailer',
1057 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1058 'upload_date': '20150811',
1059 'uploader': 'FlixMatrix',
1060 'uploader_id': 'FlixMatrixKaravan',
1061 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1062 'license': 'Standard YouTube License',
1063 },
1064 'params': {
1065 'skip_download': True,
1066 },
1067 'skip': 'This video is not available.',
1068 },
1069 {
1070 # YouTube Red video with episode data
1071 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1072 'info_dict': {
1073 'id': 'iqKdEhx-dD4',
1074 'ext': 'mp4',
1075 'title': 'Isolation - Mind Field (Ep 1)',
1076 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1077 'duration': 2085,
1078 'upload_date': '20170118',
1079 'uploader': 'Vsauce',
1080 'uploader_id': 'Vsauce',
1081 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1082 'series': 'Mind Field',
1083 'season_number': 1,
1084 'episode_number': 1,
1085 },
1086 'params': {
1087 'skip_download': True,
1088 },
1089 'expected_warnings': [
1090 'Skipping DASH manifest',
1091 ],
1092 },
1093 {
1094 # The following content has been identified by the YouTube community
1095 # as inappropriate or offensive to some audiences.
1096 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1097 'info_dict': {
1098 'id': '6SJNVb0GnPI',
1099 'ext': 'mp4',
1100 'title': 'Race Differences in Intelligence',
1101 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1102 'duration': 965,
1103 'upload_date': '20140124',
1104 'uploader': 'New Century Foundation',
1105 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1106 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1107 },
1108 'params': {
1109 'skip_download': True,
1110 },
1111 },
1112 {
1113 # itag 212
1114 'url': '1t24XAntNCY',
1115 'only_matching': True,
1116 },
1117 {
1118 # geo restricted to JP
1119 'url': 'sJL6WA-aGkQ',
1120 'only_matching': True,
1121 },
1122 {
1123 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1124 'only_matching': True,
1125 },
1126 {
1127 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1128 'only_matching': True,
1129 },
1130 {
1131 # DRM protected
1132 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1133 'only_matching': True,
1134 },
1135 {
1136 # Video with unsupported adaptive stream type formats
1137 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1138 'info_dict': {
1139 'id': 'Z4Vy8R84T1U',
1140 'ext': 'mp4',
1141 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1142 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1143 'duration': 433,
1144 'upload_date': '20130923',
1145 'uploader': 'Amelia Putri Harwita',
1146 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1147 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1148 'formats': 'maxcount:10',
1149 },
1150 'params': {
1151 'skip_download': True,
1152 'youtube_include_dash_manifest': False,
1153 },
1154 'skip': 'not actual anymore',
1155 },
1156 {
1157 # Youtube Music Auto-generated description
1158 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1159 'info_dict': {
1160 'id': 'MgNrAu2pzNs',
1161 'ext': 'mp4',
1162 'title': 'Voyeur Girl',
1163 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1164 'upload_date': '20190312',
1165 'uploader': 'Stephen - Topic',
1166 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1167 'artist': 'Stephen',
1168 'track': 'Voyeur Girl',
1169 'album': 'it\'s too much love to know my dear',
1170 'release_date': '20190313',
1171 'release_year': 2019,
1172 },
1173 'params': {
1174 'skip_download': True,
1175 },
1176 },
1177 {
1178 # Youtube Music Auto-generated description
1179 # Retrieve 'artist' field from 'Artist:' in video description
1180 # when it is present on youtube music video
1181 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1182 'info_dict': {
1183 'id': 'k0jLE7tTwjY',
1184 'ext': 'mp4',
1185 'title': 'Latch Feat. Sam Smith',
1186 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1187 'upload_date': '20150110',
1188 'uploader': 'Various Artists - Topic',
1189 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1190 'artist': 'Disclosure',
1191 'track': 'Latch Feat. Sam Smith',
1192 'album': 'Latch Featuring Sam Smith',
1193 'release_date': '20121008',
1194 'release_year': 2012,
1195 },
1196 'params': {
1197 'skip_download': True,
1198 },
1199 },
1200 {
1201 # Youtube Music Auto-generated description
1202 # handle multiple artists on youtube music video
1203 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1204 'info_dict': {
1205 'id': '74qn0eJSjpA',
1206 'ext': 'mp4',
1207 'title': 'Eastside',
1208 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1209 'upload_date': '20180710',
1210 'uploader': 'Benny Blanco - Topic',
1211 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1212 'artist': 'benny blanco, Halsey, Khalid',
1213 'track': 'Eastside',
1214 'album': 'Eastside',
1215 'release_date': '20180713',
1216 'release_year': 2018,
1217 },
1218 'params': {
1219 'skip_download': True,
1220 },
1221 },
1222 {
1223 # Youtube Music Auto-generated description
1224 # handle youtube music video with release_year and no release_date
1225 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1226 'info_dict': {
1227 'id': '-hcAI0g-f5M',
1228 'ext': 'mp4',
1229 'title': 'Put It On Me',
1230 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1231 'upload_date': '20180426',
1232 'uploader': 'Matt Maeson - Topic',
1233 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1234 'artist': 'Matt Maeson',
1235 'track': 'Put It On Me',
1236 'album': 'The Hearse',
1237 'release_date': None,
1238 'release_year': 2018,
1239 },
1240 'params': {
1241 'skip_download': True,
1242 },
1243 },
1244 {
1245 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1246 'only_matching': True,
1247 },
1248 {
1249 # invalid -> valid video id redirection
1250 'url': 'DJztXj2GPfl',
1251 'info_dict': {
1252 'id': 'DJztXj2GPfk',
1253 'ext': 'mp4',
1254 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1255 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1256 'upload_date': '20090125',
1257 'uploader': 'Prochorowka',
1258 'uploader_id': 'Prochorowka',
1259 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1260 'artist': 'Panjabi MC',
1261 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1262 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1263 },
1264 'params': {
1265 'skip_download': True,
1266 },
1267 },
1268 {
1269 # empty description results in an empty string
1270 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1271 'info_dict': {
1272 'id': 'x41yOUIvK2k',
1273 'ext': 'mp4',
1274 'title': 'IMG 3456',
1275 'description': '',
1276 'upload_date': '20170613',
1277 'uploader_id': 'ElevageOrVert',
1278 'uploader': 'ElevageOrVert',
1279 },
1280 'params': {
1281 'skip_download': True,
1282 },
1283 },
1284 ]
1285
1286 def __init__(self, *args, **kwargs):
1287 super(YoutubeIE, self).__init__(*args, **kwargs)
1288 self._player_cache = {}
1289
1290 def report_video_info_webpage_download(self, video_id):
1291 """Report attempt to download video info webpage."""
1292 self.to_screen('%s: Downloading video info webpage' % video_id)
1293
1294 def report_information_extraction(self, video_id):
1295 """Report attempt to extract video information."""
1296 self.to_screen('%s: Extracting video information' % video_id)
1297
1298 def report_unavailable_format(self, video_id, format):
1299 """Report extracted video URL."""
1300 self.to_screen('%s: Format %s not available' % (video_id, format))
1301
1302 def report_rtmp_download(self):
1303 """Indicate the download will use the RTMP protocol."""
1304 self.to_screen('RTMP download detected')
1305
1306 def _signature_cache_id(self, example_sig):
1307 """ Return a string representation of a signature """
1308 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1309
1310 @classmethod
1311 def _extract_player_info(cls, player_url):
1312 for player_re in cls._PLAYER_INFO_RE:
1313 id_m = re.search(player_re, player_url)
1314 if id_m:
1315 break
1316 else:
1317 raise ExtractorError('Cannot identify player %r' % player_url)
1318 return id_m.group('ext'), id_m.group('id')
1319
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a function that deciphers signatures shaped like *example_sig*.

        The result is resolved from the filesystem cache when possible;
        otherwise the player code (JS or SWF) is downloaded and parsed, and
        the derived character permutation is stored back into the cache.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        # The key includes the signature "shape" (dotted part lengths) since
        # players may transform different shapes differently.
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices, i.e. a pure permutation.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            # _extract_player_info only yields 'js' or 'swf', so this is
            # unreachable unless that invariant is broken.
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string so each output
        # character's ordinal records which input index it came from; that
        # index list is what gets cached above.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1359
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* on a probe string, recovers the index permutation it
        performs, and prints it as a chain of index/slice expressions
        (used for the youtube_print_sig_code debug option).
        """
        def gen_sig_code(idxs):
            # Yield 's[i]' / slice expressions that, joined with ' + ',
            # reproduce the permutation described by *idxs*. Consecutive
            # indices with step +/-1 are compressed into a single slice.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it, or flush the
                    # finished slice when the step pattern breaks.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever the loop left pending: a lone index or an
            # unfinished slice ending at the last element.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1398
1399 def _parse_sig_js(self, jscode):
1400 funcname = self._search_regex(
1401 (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1402 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1403 r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1404 r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1405 # Obsolete patterns
1406 r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1407 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1408 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1409 r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1410 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1411 r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1412 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1413 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1414 jscode, 'Initial JS player signature function name', group='sig')
1415
1416 jsi = JSInterpreter(jscode)
1417 initial_function = jsi.extract_function(funcname)
1418 return lambda s: initial_function([s])
1419
1420 def _parse_sig_swf(self, file_contents):
1421 swfi = SWFInterpreter(file_contents)
1422 TARGET_CLASSNAME = 'SignatureDecipher'
1423 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1424 initial_function = swfi.extract_function(searched_class, 'decipher')
1425 return lambda s: initial_function([s])
1426
1427 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1428 """Turn the encrypted s field into a working signature"""
1429
1430 if player_url is None:
1431 raise ExtractorError('Cannot decrypt signature without player_url')
1432
1433 if player_url.startswith('//'):
1434 player_url = 'https:' + player_url
1435 elif not re.match(r'https?://', player_url):
1436 player_url = compat_urlparse.urljoin(
1437 'https://www.youtube.com', player_url)
1438 try:
1439 player_id = (player_url, self._signature_cache_id(s))
1440 if player_id not in self._player_cache:
1441 func = self._extract_signature_function(
1442 video_id, player_url, s
1443 )
1444 self._player_cache[player_id] = func
1445 func = self._player_cache[player_id]
1446 if self._downloader.params.get('youtube_print_sig_code'):
1447 self._print_sig_code(func, s)
1448 return func(s)
1449 except Exception as e:
1450 tb = traceback.format_exc()
1451 raise ExtractorError(
1452 'Signature extraction failed: ' + tb, cause=e)
1453
1454 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1455 try:
1456 subs_doc = self._download_xml(
1457 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1458 video_id, note=False)
1459 except ExtractorError as err:
1460 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1461 return {}
1462
1463 sub_lang_list = {}
1464 for track in subs_doc.findall('track'):
1465 lang = track.attrib['lang_code']
1466 if lang in sub_lang_list:
1467 continue
1468 sub_formats = []
1469 for ext in self._SUBTITLE_FORMATS:
1470 params = compat_urllib_parse_urlencode({
1471 'lang': lang,
1472 'v': video_id,
1473 'fmt': ext,
1474 'name': track.attrib['name'].encode('utf-8'),
1475 })
1476 sub_formats.append({
1477 'url': 'https://www.youtube.com/api/timedtext?' + params,
1478 'ext': ext,
1479 })
1480 sub_lang_list[lang] = sub_formats
1481 if has_live_chat_replay:
1482 sub_lang_list['live_chat'] = [
1483 {
1484 'video_id': video_id,
1485 'ext': 'json',
1486 'protocol': 'youtube_live_chat_replay',
1487 },
1488 ]
1489 if not sub_lang_list:
1490 self._downloader.report_warning('video doesn\'t have subtitles')
1491 return {}
1492 return sub_lang_list
1493
1494 def _get_ytplayer_config(self, video_id, webpage):
1495 patterns = (
1496 # User data may contain arbitrary character sequences that may affect
1497 # JSON extraction with regex, e.g. when '};' is contained the second
1498 # regex won't capture the whole JSON. Yet working around by trying more
1499 # concrete regex first keeping in mind proper quoted string handling
1500 # to be implemented in future that will replace this workaround (see
1501 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1502 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1503 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1504 r';ytplayer\.config\s*=\s*({.+?});',
1505 )
1506 config = self._search_regex(
1507 patterns, webpage, 'ytplayer.config', default=None)
1508 if config:
1509 return self._parse_json(
1510 uppercase_escape(config), video_id, fatal=False)
1511
1512 def _get_yt_initial_data(self, video_id, webpage):
1513 config = self._search_regex(
1514 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1515 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
1516 webpage, 'ytInitialData', default=None)
1517 if config:
1518 return self._parse_json(
1519 uppercase_escape(config), video_id, fatal=False)
1520
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy ttsurl flow: list the available tracks via the
                # timedtext API, then build one URL per language/format.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build a captions dict by rewriting the query string of
                # *sub_url* for every requested language/format pair.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    caption_tracks = renderer['captionTracks']
                    for caption_track in caption_tracks:
                        if 'kind' not in caption_track:
                            # not an automatic transcription
                            continue
                        base_url = caption_track['baseUrl']
                        sub_lang_list = []
                        for lang in renderer['translationLanguages']:
                            lang_code = lang.get('languageCode')
                            if lang_code:
                                sub_lang_list.append(lang_code)
                        # Only the first automatic track found is used.
                        return make_captions(base_url, sub_lang_list)

                    # Same text as err_msg above, just built with different
                    # quoting.
                    self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                    return {}
            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1629
1630 def _mark_watched(self, video_id, video_info, player_response):
1631 playback_url = url_or_none(try_get(
1632 player_response,
1633 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1634 video_info, lambda x: x['videostats_playback_base_url'][0]))
1635 if not playback_url:
1636 return
1637 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1638 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1639
1640 # cpn generation algorithm is reverse engineered from base.js.
1641 # In fact it works even with dummy cpn.
1642 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1643 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1644
1645 qs.update({
1646 'ver': ['2'],
1647 'cpn': [cpn],
1648 })
1649 playback_url = compat_urlparse.urlunparse(
1650 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1651
1652 self._download_webpage(
1653 playback_url, video_id, 'Marking watched',
1654 'Unable to mark watched', fatal=False)
1655
1656 @staticmethod
1657 def _extract_urls(webpage):
1658 # Embedded YouTube player
1659 entries = [
1660 unescapeHTML(mobj.group('url'))
1661 for mobj in re.finditer(r'''(?x)
1662 (?:
1663 <iframe[^>]+?src=|
1664 data-video-url=|
1665 <embed[^>]+?src=|
1666 embedSWF\(?:\s*|
1667 <object[^>]+data=|
1668 new\s+SWFObject\(
1669 )
1670 (["\'])
1671 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1672 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1673 \1''', webpage)]
1674
1675 # lazyYT YouTube embed
1676 entries.extend(list(map(
1677 unescapeHTML,
1678 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1679
1680 # Wordpress "YouTube Video Importer" plugin
1681 matches = re.findall(r'''(?x)<div[^>]+
1682 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1683 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1684 entries.extend(m[-1] for m in matches)
1685
1686 return entries
1687
1688 @staticmethod
1689 def _extract_url(webpage):
1690 urls = YoutubeIE._extract_urls(webpage)
1691 return urls[0] if urls else None
1692
1693 @classmethod
1694 def extract_id(cls, url):
1695 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1696 if mobj is None:
1697 raise ExtractorError('Invalid URL: %s' % url)
1698 video_id = mobj.group(2)
1699 return video_id
1700
1701 def _extract_chapters_from_json(self, webpage, video_id, duration):
1702 if not webpage:
1703 return
1704 initial_data = self._parse_json(
1705 self._search_regex(
1706 r'window\["ytInitialData"\] = (.+);\n', webpage,
1707 'player args', default='{}'),
1708 video_id, fatal=False)
1709 if not initial_data or not isinstance(initial_data, dict):
1710 return
1711 chapters_list = try_get(
1712 initial_data,
1713 lambda x: x['playerOverlays']
1714 ['playerOverlayRenderer']
1715 ['decoratedPlayerBarRenderer']
1716 ['decoratedPlayerBarRenderer']
1717 ['playerBar']
1718 ['chapteredPlayerBarRenderer']
1719 ['chapters'],
1720 list)
1721 if not chapters_list:
1722 return
1723
1724 def chapter_time(chapter):
1725 return float_or_none(
1726 try_get(
1727 chapter,
1728 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1729 int),
1730 scale=1000)
1731 chapters = []
1732 for next_num, chapter in enumerate(chapters_list, start=1):
1733 start_time = chapter_time(chapter)
1734 if start_time is None:
1735 continue
1736 end_time = (chapter_time(chapters_list[next_num])
1737 if next_num < len(chapters_list) else duration)
1738 if end_time is None:
1739 continue
1740 title = try_get(
1741 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1742 compat_str)
1743 chapters.append({
1744 'start_time': start_time,
1745 'end_time': end_time,
1746 'title': title,
1747 })
1748 return chapters
1749
1750 @staticmethod
1751 def _extract_chapters_from_description(description, duration):
1752 if not description:
1753 return None
1754 chapter_lines = re.findall(
1755 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1756 description)
1757 if not chapter_lines:
1758 return None
1759 chapters = []
1760 for next_num, (chapter_line, time_point) in enumerate(
1761 chapter_lines, start=1):
1762 start_time = parse_duration(time_point)
1763 if start_time is None:
1764 continue
1765 if start_time > duration:
1766 break
1767 end_time = (duration if next_num == len(chapter_lines)
1768 else parse_duration(chapter_lines[next_num][1]))
1769 if end_time is None:
1770 continue
1771 if end_time > duration:
1772 end_time = duration
1773 if start_time > end_time:
1774 break
1775 chapter_title = re.sub(
1776 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1777 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1778 chapters.append({
1779 'start_time': start_time,
1780 'end_time': end_time,
1781 'title': chapter_title,
1782 })
1783 return chapters
1784
1785 def _extract_chapters(self, webpage, description, video_id, duration):
1786 return (self._extract_chapters_from_json(webpage, video_id, duration)
1787 or self._extract_chapters_from_description(description, duration))
1788
1789 def _real_extract(self, url):
1790 url, smuggled_data = unsmuggle_url(url, {})
1791
1792 proto = (
1793 'http' if self._downloader.params.get('prefer_insecure', False)
1794 else 'https')
1795
1796 start_time = None
1797 end_time = None
1798 parsed_url = compat_urllib_parse_urlparse(url)
1799 for component in [parsed_url.fragment, parsed_url.query]:
1800 query = compat_parse_qs(component)
1801 if start_time is None and 't' in query:
1802 start_time = parse_duration(query['t'][0])
1803 if start_time is None and 'start' in query:
1804 start_time = parse_duration(query['start'][0])
1805 if end_time is None and 'end' in query:
1806 end_time = parse_duration(query['end'][0])
1807
1808 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1809 mobj = re.search(self._NEXT_URL_RE, url)
1810 if mobj:
1811 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1812 video_id = self.extract_id(url)
1813
1814 # Get video webpage
1815 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1816 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1817
1818 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1819 video_id = qs.get('v', [None])[0] or video_id
1820
1821 # Attempt to extract SWF player URL
1822 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1823 if mobj is not None:
1824 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1825 else:
1826 player_url = None
1827
1828 dash_mpds = []
1829
1830 def add_dash_mpd(video_info):
1831 dash_mpd = video_info.get('dashmpd')
1832 if dash_mpd and dash_mpd[0] not in dash_mpds:
1833 dash_mpds.append(dash_mpd[0])
1834
1835 def add_dash_mpd_pr(pl_response):
1836 dash_mpd = url_or_none(try_get(
1837 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1838 compat_str))
1839 if dash_mpd and dash_mpd not in dash_mpds:
1840 dash_mpds.append(dash_mpd)
1841
1842 is_live = None
1843 view_count = None
1844
1845 def extract_view_count(v_info):
1846 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1847
1848 def extract_player_response(player_response, video_id):
1849 pl_response = str_or_none(player_response)
1850 if not pl_response:
1851 return
1852 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1853 if isinstance(pl_response, dict):
1854 add_dash_mpd_pr(pl_response)
1855 return pl_response
1856
1857 player_response = {}
1858
1859 # Get video info
1860 video_info = {}
1861 embed_webpage = None
1862 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1863 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1864 age_gate = True
1865 # We simulate the access to the video from www.youtube.com/v/{video_id}
1866 # this can be viewed without login into Youtube
1867 url = proto + '://www.youtube.com/embed/%s' % video_id
1868 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1869 data = compat_urllib_parse_urlencode({
1870 'video_id': video_id,
1871 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1872 'sts': self._search_regex(
1873 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1874 })
1875 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1876 try:
1877 video_info_webpage = self._download_webpage(
1878 video_info_url, video_id,
1879 note='Refetching age-gated info webpage',
1880 errnote='unable to download video info webpage')
1881 except ExtractorError:
1882 video_info_webpage = None
1883 if video_info_webpage:
1884 video_info = compat_parse_qs(video_info_webpage)
1885 pl_response = video_info.get('player_response', [None])[0]
1886 player_response = extract_player_response(pl_response, video_id)
1887 add_dash_mpd(video_info)
1888 view_count = extract_view_count(video_info)
1889 else:
1890 age_gate = False
1891 # Try looking directly into the video webpage
1892 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1893 if ytplayer_config:
1894 args = ytplayer_config['args']
1895 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1896 # Convert to the same format returned by compat_parse_qs
1897 video_info = dict((k, [v]) for k, v in args.items())
1898 add_dash_mpd(video_info)
1899 # Rental video is not rented but preview is available (e.g.
1900 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1901 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1902 if not video_info and args.get('ypc_vid'):
1903 return self.url_result(
1904 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1905 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1906 is_live = True
1907 if not player_response:
1908 player_response = extract_player_response(args.get('player_response'), video_id)
1909 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1910 add_dash_mpd_pr(player_response)
1911
1912 def extract_unavailable_message():
1913 messages = []
1914 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1915 msg = self._html_search_regex(
1916 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1917 video_webpage, 'unavailable %s' % kind, default=None)
1918 if msg:
1919 messages.append(msg)
1920 if messages:
1921 return '\n'.join(messages)
1922
1923 if not video_info and not player_response:
1924 unavailable_message = extract_unavailable_message()
1925 if not unavailable_message:
1926 unavailable_message = 'Unable to extract video data'
1927 raise ExtractorError(
1928 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1929
1930 if not isinstance(video_info, dict):
1931 video_info = {}
1932
1933 video_details = try_get(
1934 player_response, lambda x: x['videoDetails'], dict) or {}
1935
1936 microformat = try_get(
1937 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1938
1939 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1940 if not video_title:
1941 self._downloader.report_warning('Unable to extract video title')
1942 video_title = '_'
1943
1944 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1945 if video_description:
1946
1947 def replace_url(m):
1948 redir_url = compat_urlparse.urljoin(url, m.group(1))
1949 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1950 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1951 qs = compat_parse_qs(parsed_redir_url.query)
1952 q = qs.get('q')
1953 if q and q[0]:
1954 return q[0]
1955 return redir_url
1956
1957 description_original = video_description = re.sub(r'''(?x)
1958 <a\s+
1959 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1960 (?:title|href)="([^"]+)"\s+
1961 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1962 class="[^"]*"[^>]*>
1963 [^<]+\.{3}\s*
1964 </a>
1965 ''', replace_url, video_description)
1966 video_description = clean_html(video_description)
1967 else:
1968 video_description = video_details.get('shortDescription')
1969 if video_description is None:
1970 video_description = self._html_search_meta('description', video_webpage)
1971
1972 if not smuggled_data.get('force_singlefeed', False):
1973 if not self._downloader.params.get('noplaylist'):
1974 multifeed_metadata_list = try_get(
1975 player_response,
1976 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1977 compat_str) or try_get(
1978 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1979 if multifeed_metadata_list:
1980 entries = []
1981 feed_ids = []
1982 for feed in multifeed_metadata_list.split(','):
1983 # Unquote should take place before split on comma (,) since textual
1984 # fields may contain comma as well (see
1985 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1986 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1987
1988 def feed_entry(name):
1989 return try_get(feed_data, lambda x: x[name][0], compat_str)
1990
1991 feed_id = feed_entry('id')
1992 if not feed_id:
1993 continue
1994 feed_title = feed_entry('title')
1995 title = video_title
1996 if feed_title:
1997 title += ' (%s)' % feed_title
1998 entries.append({
1999 '_type': 'url_transparent',
2000 'ie_key': 'Youtube',
2001 'url': smuggle_url(
2002 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2003 {'force_singlefeed': True}),
2004 'title': title,
2005 })
2006 feed_ids.append(feed_id)
2007 self.to_screen(
2008 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2009 % (', '.join(feed_ids), video_id))
2010 return self.playlist_result(entries, video_id, video_title, video_description)
2011 else:
2012 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2013
2014 if view_count is None:
2015 view_count = extract_view_count(video_info)
2016 if view_count is None and video_details:
2017 view_count = int_or_none(video_details.get('viewCount'))
2018 if view_count is None and microformat:
2019 view_count = int_or_none(microformat.get('viewCount'))
2020
2021 if is_live is None:
2022 is_live = bool_or_none(video_details.get('isLive'))
2023
2024 has_live_chat_replay = False
2025 if not is_live:
2026 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2027 try:
2028 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2029 has_live_chat_replay = True
2030 except (KeyError, IndexError, TypeError):
2031 pass
2032
2033 # Check for "rental" videos
2034 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2035 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2036
2037 def _extract_filesize(media_url):
2038 return int_or_none(self._search_regex(
2039 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2040
2041 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2042 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2043
2044 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2045 self.report_rtmp_download()
2046 formats = [{
2047 'format_id': '_rtmp',
2048 'protocol': 'rtmp',
2049 'url': video_info['conn'][0],
2050 'player_url': player_url,
2051 }]
2052 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2053 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2054 if 'rtmpe%3Dyes' in encoded_url_map:
2055 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2056 formats = []
2057 formats_spec = {}
2058 fmt_list = video_info.get('fmt_list', [''])[0]
2059 if fmt_list:
2060 for fmt in fmt_list.split(','):
2061 spec = fmt.split('/')
2062 if len(spec) > 1:
2063 width_height = spec[1].split('x')
2064 if len(width_height) == 2:
2065 formats_spec[spec[0]] = {
2066 'resolution': spec[1],
2067 'width': int_or_none(width_height[0]),
2068 'height': int_or_none(width_height[1]),
2069 }
2070 for fmt in streaming_formats:
2071 itag = str_or_none(fmt.get('itag'))
2072 if not itag:
2073 continue
2074 quality = fmt.get('quality')
2075 quality_label = fmt.get('qualityLabel') or quality
2076 formats_spec[itag] = {
2077 'asr': int_or_none(fmt.get('audioSampleRate')),
2078 'filesize': int_or_none(fmt.get('contentLength')),
2079 'format_note': quality_label,
2080 'fps': int_or_none(fmt.get('fps')),
2081 'height': int_or_none(fmt.get('height')),
2082 # bitrate for itag 43 is always 2147483647
2083 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2084 'width': int_or_none(fmt.get('width')),
2085 }
2086
2087 for fmt in streaming_formats:
2088 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2089 continue
2090 url = url_or_none(fmt.get('url'))
2091
2092 if not url:
2093 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2094 if not cipher:
2095 continue
2096 url_data = compat_parse_qs(cipher)
2097 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2098 if not url:
2099 continue
2100 else:
2101 cipher = None
2102 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2103
2104 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2105 # Unsupported FORMAT_STREAM_TYPE_OTF
2106 if stream_type == 3:
2107 continue
2108
2109 format_id = fmt.get('itag') or url_data['itag'][0]
2110 if not format_id:
2111 continue
2112 format_id = compat_str(format_id)
2113
2114 if cipher:
2115 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2116 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
2117 jsplayer_url_json = self._search_regex(
2118 ASSETS_RE,
2119 embed_webpage if age_gate else video_webpage,
2120 'JS player URL (1)', default=None)
2121 if not jsplayer_url_json and not age_gate:
2122 # We need the embed website after all
2123 if embed_webpage is None:
2124 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2125 embed_webpage = self._download_webpage(
2126 embed_url, video_id, 'Downloading embed webpage')
2127 jsplayer_url_json = self._search_regex(
2128 ASSETS_RE, embed_webpage, 'JS player URL')
2129
2130 player_url = json.loads(jsplayer_url_json)
2131 if player_url is None:
2132 player_url_json = self._search_regex(
2133 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2134 video_webpage, 'age gate player URL')
2135 player_url = json.loads(player_url_json)
2136
2137 if 'sig' in url_data:
2138 url += '&signature=' + url_data['sig'][0]
2139 elif 's' in url_data:
2140 encrypted_sig = url_data['s'][0]
2141
2142 if self._downloader.params.get('verbose'):
2143 if player_url is None:
2144 player_desc = 'unknown'
2145 else:
2146 player_type, player_version = self._extract_player_info(player_url)
2147 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2148 parts_sizes = self._signature_cache_id(encrypted_sig)
2149 self.to_screen('{%s} signature length %s, %s' %
2150 (format_id, parts_sizes, player_desc))
2151
2152 signature = self._decrypt_signature(
2153 encrypted_sig, video_id, player_url, age_gate)
2154 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2155 url += '&%s=%s' % (sp, signature)
2156 if 'ratebypass' not in url:
2157 url += '&ratebypass=yes'
2158
2159 dct = {
2160 'format_id': format_id,
2161 'url': url,
2162 'player_url': player_url,
2163 }
2164 if format_id in self._formats:
2165 dct.update(self._formats[format_id])
2166 if format_id in formats_spec:
2167 dct.update(formats_spec[format_id])
2168
2169 # Some itags are not included in DASH manifest thus corresponding formats will
2170 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2171 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2172 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2173 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2174
2175 if width is None:
2176 width = int_or_none(fmt.get('width'))
2177 if height is None:
2178 height = int_or_none(fmt.get('height'))
2179
2180 filesize = int_or_none(url_data.get(
2181 'clen', [None])[0]) or _extract_filesize(url)
2182
2183 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2184 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2185
2186 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2187 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2188 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2189
2190 more_fields = {
2191 'filesize': filesize,
2192 'tbr': tbr,
2193 'width': width,
2194 'height': height,
2195 'fps': fps,
2196 'format_note': quality_label or quality,
2197 }
2198 for key, value in more_fields.items():
2199 if value:
2200 dct[key] = value
2201 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2202 if type_:
2203 type_split = type_.split(';')
2204 kind_ext = type_split[0].split('/')
2205 if len(kind_ext) == 2:
2206 kind, _ = kind_ext
2207 dct['ext'] = mimetype2ext(type_split[0])
2208 if kind in ('audio', 'video'):
2209 codecs = None
2210 for mobj in re.finditer(
2211 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2212 if mobj.group('key') == 'codecs':
2213 codecs = mobj.group('val')
2214 break
2215 if codecs:
2216 dct.update(parse_codecs(codecs))
2217 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2218 dct['downloader_options'] = {
2219 # Youtube throttles chunks >~10M
2220 'http_chunk_size': 10485760,
2221 }
2222 formats.append(dct)
2223 else:
2224 manifest_url = (
2225 url_or_none(try_get(
2226 player_response,
2227 lambda x: x['streamingData']['hlsManifestUrl'],
2228 compat_str))
2229 or url_or_none(try_get(
2230 video_info, lambda x: x['hlsvp'][0], compat_str)))
2231 if manifest_url:
2232 formats = []
2233 m3u8_formats = self._extract_m3u8_formats(
2234 manifest_url, video_id, 'mp4', fatal=False)
2235 for a_format in m3u8_formats:
2236 itag = self._search_regex(
2237 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2238 if itag:
2239 a_format['format_id'] = itag
2240 if itag in self._formats:
2241 dct = self._formats[itag].copy()
2242 dct.update(a_format)
2243 a_format = dct
2244 a_format['player_url'] = player_url
2245 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2246 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2247 if self._downloader.params.get('youtube_include_hls_manifest', True):
2248 formats.append(a_format)
2249 else:
2250 error_message = extract_unavailable_message()
2251 if not error_message:
2252 error_message = clean_html(try_get(
2253 player_response, lambda x: x['playabilityStatus']['reason'],
2254 compat_str))
2255 if not error_message:
2256 error_message = clean_html(
2257 try_get(video_info, lambda x: x['reason'][0], compat_str))
2258 if error_message:
2259 raise ExtractorError(error_message, expected=True)
2260 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2261
2262 # uploader
2263 video_uploader = try_get(
2264 video_info, lambda x: x['author'][0],
2265 compat_str) or str_or_none(video_details.get('author'))
2266 if video_uploader:
2267 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2268 else:
2269 self._downloader.report_warning('unable to extract uploader name')
2270
2271 # uploader_id
2272 video_uploader_id = None
2273 video_uploader_url = None
2274 mobj = re.search(
2275 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2276 video_webpage)
2277 if mobj is not None:
2278 video_uploader_id = mobj.group('uploader_id')
2279 video_uploader_url = mobj.group('uploader_url')
2280 else:
2281 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2282 if owner_profile_url:
2283 video_uploader_id = self._search_regex(
2284 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2285 default=None)
2286 video_uploader_url = owner_profile_url
2287
2288 channel_id = (
2289 str_or_none(video_details.get('channelId'))
2290 or self._html_search_meta(
2291 'channelId', video_webpage, 'channel id', default=None)
2292 or self._search_regex(
2293 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2294 video_webpage, 'channel id', default=None, group='id'))
2295 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2296
2297 thumbnails = []
2298 thumbnails_list = try_get(
2299 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2300 for t in thumbnails_list:
2301 if not isinstance(t, dict):
2302 continue
2303 thumbnail_url = url_or_none(t.get('url'))
2304 if not thumbnail_url:
2305 continue
2306 thumbnails.append({
2307 'url': thumbnail_url,
2308 'width': int_or_none(t.get('width')),
2309 'height': int_or_none(t.get('height')),
2310 })
2311
2312 if not thumbnails:
2313 video_thumbnail = None
2314 # We try first to get a high quality image:
2315 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2316 video_webpage, re.DOTALL)
2317 if m_thumb is not None:
2318 video_thumbnail = m_thumb.group(1)
2319 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2320 if thumbnail_url:
2321 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2322 if video_thumbnail:
2323 thumbnails.append({'url': video_thumbnail})
2324
2325 # upload date
2326 upload_date = self._html_search_meta(
2327 'datePublished', video_webpage, 'upload date', default=None)
2328 if not upload_date:
2329 upload_date = self._search_regex(
2330 [r'(?s)id="eow-date.*?>(.*?)</span>',
2331 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2332 video_webpage, 'upload date', default=None)
2333 if not upload_date:
2334 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2335 upload_date = unified_strdate(upload_date)
2336
2337 video_license = self._html_search_regex(
2338 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2339 video_webpage, 'license', default=None)
2340
2341 m_music = re.search(
2342 r'''(?x)
2343 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2344 <ul[^>]*>\s*
2345 <li>(?P<title>.+?)
2346 by (?P<creator>.+?)
2347 (?:
2348 \(.+?\)|
2349 <a[^>]*
2350 (?:
2351 \bhref=["\']/red[^>]*>| # drop possible
2352 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2353 )
2354 .*?
2355 )?</li
2356 ''',
2357 video_webpage)
2358 if m_music:
2359 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2360 video_creator = clean_html(m_music.group('creator'))
2361 else:
2362 video_alt_title = video_creator = None
2363
2364 def extract_meta(field):
2365 return self._html_search_regex(
2366 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2367 video_webpage, field, default=None)
2368
2369 track = extract_meta('Song')
2370 artist = extract_meta('Artist')
2371 album = extract_meta('Album')
2372
2373 # Youtube Music Auto-generated description
2374 release_date = release_year = None
2375 if video_description:
2376 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2377 if mobj:
2378 if not track:
2379 track = mobj.group('track').strip()
2380 if not artist:
2381 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2382 if not album:
2383 album = mobj.group('album'.strip())
2384 release_year = mobj.group('release_year')
2385 release_date = mobj.group('release_date')
2386 if release_date:
2387 release_date = release_date.replace('-', '')
2388 if not release_year:
2389 release_year = int(release_date[:4])
2390 if release_year:
2391 release_year = int(release_year)
2392
2393 m_episode = re.search(
2394 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2395 video_webpage)
2396 if m_episode:
2397 series = unescapeHTML(m_episode.group('series'))
2398 season_number = int(m_episode.group('season'))
2399 episode_number = int(m_episode.group('episode'))
2400 else:
2401 series = season_number = episode_number = None
2402
2403 m_cat_container = self._search_regex(
2404 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2405 video_webpage, 'categories', default=None)
2406 category = None
2407 if m_cat_container:
2408 category = self._html_search_regex(
2409 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2410 default=None)
2411 if not category:
2412 category = try_get(
2413 microformat, lambda x: x['category'], compat_str)
2414 video_categories = None if category is None else [category]
2415
2416 video_tags = [
2417 unescapeHTML(m.group('content'))
2418 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2419 if not video_tags:
2420 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2421
2422 def _extract_count(count_name):
2423 return str_to_int(self._search_regex(
2424 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2425 % re.escape(count_name),
2426 video_webpage, count_name, default=None))
2427
2428 like_count = _extract_count('like')
2429 dislike_count = _extract_count('dislike')
2430
2431 if view_count is None:
2432 view_count = str_to_int(self._search_regex(
2433 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2434 'view count', default=None))
2435
2436 average_rating = (
2437 float_or_none(video_details.get('averageRating'))
2438 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2439
2440 # subtitles
2441 video_subtitles = self.extract_subtitles(
2442 video_id, video_webpage, has_live_chat_replay)
2443 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2444
2445 video_duration = try_get(
2446 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2447 if not video_duration:
2448 video_duration = int_or_none(video_details.get('lengthSeconds'))
2449 if not video_duration:
2450 video_duration = parse_duration(self._html_search_meta(
2451 'duration', video_webpage, 'video duration'))
2452
2453 # annotations
2454 video_annotations = None
2455 if self._downloader.params.get('writeannotations', False):
2456 xsrf_token = self._search_regex(
2457 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2458 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2459 invideo_url = try_get(
2460 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2461 if xsrf_token and invideo_url:
2462 xsrf_field_name = self._search_regex(
2463 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2464 video_webpage, 'xsrf field name',
2465 group='xsrf_field_name', default='session_token')
2466 video_annotations = self._download_webpage(
2467 self._proto_relative_url(invideo_url),
2468 video_id, note='Downloading annotations',
2469 errnote='Unable to download video annotations', fatal=False,
2470 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2471
2472 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2473
2474 # Look for the DASH manifest
2475 if self._downloader.params.get('youtube_include_dash_manifest', True):
2476 dash_mpd_fatal = True
2477 for mpd_url in dash_mpds:
2478 dash_formats = {}
2479 try:
2480 def decrypt_sig(mobj):
2481 s = mobj.group(1)
2482 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2483 return '/signature/%s' % dec_s
2484
2485 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2486
2487 for df in self._extract_mpd_formats(
2488 mpd_url, video_id, fatal=dash_mpd_fatal,
2489 formats_dict=self._formats):
2490 if not df.get('filesize'):
2491 df['filesize'] = _extract_filesize(df['url'])
2492 # Do not overwrite DASH format found in some previous DASH manifest
2493 if df['format_id'] not in dash_formats:
2494 dash_formats[df['format_id']] = df
2495 # Additional DASH manifests may end up in HTTP Error 403 therefore
2496 # allow them to fail without bug report message if we already have
2497 # some DASH manifest succeeded. This is temporary workaround to reduce
2498 # burst of bug reports until we figure out the reason and whether it
2499 # can be fixed at all.
2500 dash_mpd_fatal = False
2501 except (ExtractorError, KeyError) as e:
2502 self.report_warning(
2503 'Skipping DASH manifest: %r' % e, video_id)
2504 if dash_formats:
2505 # Remove the formats we found through non-DASH, they
2506 # contain less info and it can be wrong, because we use
2507 # fixed values (for example the resolution). See
2508 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2509 # example.
2510 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2511 formats.extend(dash_formats.values())
2512
2513 # Check for malformed aspect ratio
2514 stretched_m = re.search(
2515 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2516 video_webpage)
2517 if stretched_m:
2518 w = float(stretched_m.group('w'))
2519 h = float(stretched_m.group('h'))
2520 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2521 # We will only process correct ratios.
2522 if w > 0 and h > 0:
2523 ratio = w / h
2524 for f in formats:
2525 if f.get('vcodec') != 'none':
2526 f['stretched_ratio'] = ratio
2527
2528 if not formats:
2529 if 'reason' in video_info:
2530 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2531 regions_allowed = self._html_search_meta(
2532 'regionsAllowed', video_webpage, default=None)
2533 countries = regions_allowed.split(',') if regions_allowed else None
2534 self.raise_geo_restricted(
2535 msg=video_info['reason'][0], countries=countries)
2536 reason = video_info['reason'][0]
2537 if 'Invalid parameters' in reason:
2538 unavailable_message = extract_unavailable_message()
2539 if unavailable_message:
2540 reason = unavailable_message
2541 raise ExtractorError(
2542 'YouTube said: %s' % reason,
2543 expected=True, video_id=video_id)
2544 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2545 raise ExtractorError('This video is DRM protected.', expected=True)
2546
2547 self._sort_formats(formats)
2548
2549 self.mark_watched(video_id, video_info, player_response)
2550
2551 return {
2552 'id': video_id,
2553 'uploader': video_uploader,
2554 'uploader_id': video_uploader_id,
2555 'uploader_url': video_uploader_url,
2556 'channel_id': channel_id,
2557 'channel_url': channel_url,
2558 'upload_date': upload_date,
2559 'license': video_license,
2560 'creator': video_creator or artist,
2561 'title': video_title,
2562 'alt_title': video_alt_title or track,
2563 'thumbnails': thumbnails,
2564 'description': video_description,
2565 'categories': video_categories,
2566 'tags': video_tags,
2567 'subtitles': video_subtitles,
2568 'automatic_captions': automatic_captions,
2569 'duration': video_duration,
2570 'age_limit': 18 if age_gate else 0,
2571 'annotations': video_annotations,
2572 'chapters': chapters,
2573 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2574 'view_count': view_count,
2575 'like_count': like_count,
2576 'dislike_count': dislike_count,
2577 'average_rating': average_rating,
2578 'formats': formats,
2579 'is_live': is_live,
2580 'start_time': start_time,
2581 'end_time': end_time,
2582 'series': series,
2583 'season_number': season_number,
2584 'episode_number': episode_number,
2585 'track': track,
2586 'artist': artist,
2587 'album': album,
2588 'release_date': release_date,
2589 'release_year': release_year,
2590 }
2591
2592
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes and bare playlist ids."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        self._login()

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs scraped from *page*.

        Titles may be None when only the id could be found.
        """
        ids_in_page = []
        titles_in_page = []

        # Primary extraction path: elements carrying a data-video-id attribute.
        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix by repeatedly fetching watch pages."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Try several title containers, from most to least specific.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page.

        Returns (has_videos, playlist_result); has_videos is False when the
        page served no entries at all.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Return (video_id, result).

        result is a single-video url_result when --no-playlist was given and
        the URL also identifies a video; otherwise result is None.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix, and regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2954
2955
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for channel /videos listings."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Hook point overridden by subclasses; the original URL is unused here.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel's videos, preferring a redirect to its UU* uploads playlist."""
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC...' channel id maps to the 'UU...' uploads playlist id.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # No entries at all: surface YouTube's alert message if present.
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3055
3056
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for /user/, /c/ and ytuser: video listings."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # _VALID_URL above is very permissive, so step aside whenever any
        # other Youtube*IE defined in this module can handle the URL.
        for ie_name, ie_cls in globals().items():
            if ie_cls is cls:
                continue
            if not (ie_name.startswith('Youtube') and ie_name.endswith('IE')):
                continue
            if ie_cls.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Keep the path kind seen in the URL ('user' or 'c'); bare ytuser:
        # and pathless URLs default to 'user'.
        mobj = re.match(self._VALID_URL, url)
        path_kind = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, mobj.group('id'))
3114
3115
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for channel/user /live pages."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to its current video if one is advertised,
        otherwise hand off the plain channel URL."""
        match = re.match(self._VALID_URL, url)
        channel_id = match.group('id')
        base_url = match.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the meta videoId when the page declares itself a video
        # and the id has the canonical 11-character shape.
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3166
3167
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # This class only supplies URL matching and metadata; the extraction
    # logic comes entirely from the base class.
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3200
3201
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches /watch links in search result markup; the optional trailing
    # group also captures the title="..." attribute when one follows the href.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3204
3205
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for the ytsearch: pseudo-URL scheme."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the search results, accumulating entries until at
        least *n* have been collected or no further page is available, and
        returns a playlist truncated to exactly *n* entries.
        """
        videos = []
        limit = n

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as we have enough results. The previous strict '>'
            # comparison downloaded one extra (discarded) page whenever the
            # accumulated count landed exactly on the limit.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
3254
3255
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same as youtube:search, but asks YouTube to sort by upload date.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
3261
3262
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for full search-results URLs."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return all results scraped from the given search page."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        return self.playlist_result(
            self._process_page(webpage), playlist_title=query)
3283
3284
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for show pages, delegating to the playlists listing."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just the set of playlists under its /playlists page.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/' + show_id + '/playlists'
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3302
3303
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the personalised feed extractors.

    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield video results, following the feed's "load more" pagination."""
        # Same approach as for playlists, except the watch-link regex here
        # carries no index.
        seen_ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            found = re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # The 'recommended' feed loads forever and repeats (sometimes
            # reshuffled) videos across portions, so keep only ids not seen
            # before and stop once a portion brings nothing new.
            fresh_ids = [vid for vid in orderedSet(found) if vid not in seen_ids]
            if not fresh_ids:
                break

            seen_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                headers=self._YOUTUBE_CLIENT_HEADERS)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3356
3357
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the authenticated user's Watch Later list."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Honour --no-playlist for watch URLs that carry list=WL.
        video_id, single_video = self._check_download_just_video(url, 'WL')
        if single_video:
            return single_video
        has_videos, playlist = self._extract_playlist('WL')
        return playlist
3377
3378
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourites playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the real playlist id; hand off to the
        # playlist extractor once it is found.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
3389
3390
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos.

    All extraction logic lives in YoutubeFeedsInfoExtractor; this class only
    supplies the feed-specific constants.
    """
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Appended to https://www.youtube.com/feed/ by the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3396
3397
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed.

    All extraction logic lives in YoutubeFeedsInfoExtractor; this class only
    supplies the feed-specific constants.
    """
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Appended to https://www.youtube.com/feed/ by the base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3403
3404
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history.

    All extraction logic lives in YoutubeFeedsInfoExtractor; this class only
    supplies the feed-specific constants.
    """
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    # Appended to https://www.youtube.com/feed/ by the base class.
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3410
3411
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch-all for watch URLs whose query string lost its video id.

    These typically result from an unquoted URL in the shell, where '&'
    split the command and only a stray parameter (feature=, t=, hl=, ...)
    survived. Matching them lets us raise a helpful, expected error instead
    of a generic "no extractor found".
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Always an error by design: there is no video id to extract.
        # Fixed the doubled space before 'or simply' left by the previous
        # literal concatenation ('..." ' + ' or simply ...').
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
3459
3460
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the canonical 11
    characters (usually a copy-paste that got cut off) and raise a clear,
    expected error instead of failing obscurely downstream."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Extraction always fails on purpose: the id cannot be completed.
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)