1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import random
10 import re
11 import time
12 import traceback
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28 )
29 from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_count,
43 parse_duration,
44 remove_quotes,
45 remove_start,
46 smuggle_url,
47 str_or_none,
48 str_to_int,
49 try_get,
50 unescapeHTML,
51 unified_strdate,
52 unsmuggle_url,
53 uppercase_escape,
54 url_or_none,
55 urlencode_postdata,
56 )
57
58
59 class YoutubeBaseInfoExtractor(InfoExtractor):
60 """Provide base functions for Youtube extractors"""
61 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
62 _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
63
64 _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
65 _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
66 _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
67
68 _NETRC_MACHINE = 'youtube'
69 # If True, an error is raised when no login info is provided
70 _LOGIN_REQUIRED = False
71
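# Matches the usual playlist ID prefixes - e.g. PL (regular playlists),
# UU (channel uploads), LL (liked videos), FL (favourites), RD (mixes) and
# OLAK5uy_ (auto-generated album playlists); the exact meaning of each
# prefix is not documented by YouTube, so treat this list as best-effort.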
72 _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
73
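# Identifies requests as coming from the desktop web client; the version
# string appears to be a date-based build number and may need bumping if
# YouTube starts rejecting old clients.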
74 _YOUTUBE_CLIENT_HEADERS = {
75 'x-youtube-client-name': '1',
76 'x-youtube-client-version': '1.20200609.04.02',
77 }
78
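# Force the English interface (hl=en) via the PREF cookie so that the
# text-based extraction below is independent of the account/IP locale;
# the other PREF flags mirror what the website itself sets.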
79 def _set_language(self):
80 self._set_cookie(
81 '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
82 # YouTube sets the expire time to about two months
83 expire_time=time.time() + 2 * 30 * 24 * 3600)
84
85 def _ids_to_results(self, ids):
86 return [
87 self.url_result(vid_id, 'Youtube', video_id=vid_id)
88 for vid_id in ids]
89
90 def _login(self):
91 """
92 Attempt to log in to YouTube.
93 True is returned if successful or skipped.
94 False is returned if login failed.
95
96 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
97 """
98 username, password = self._get_login_info()
99 # No authentication to be performed
100 if username is None:
101 if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
102 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
103 return True
104
105 login_page = self._download_webpage(
106 self._LOGIN_URL, None,
107 note='Downloading login page',
108 errnote='unable to fetch login page', fatal=False)
109 if login_page is False:
110 return False
111
112 login_form = self._hidden_inputs(login_page)
113
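# Helper that POSTs one step of the Google sign-in flow; transform_source
# strips everything before the first '[' to drop the anti-XSSI prefix that
# Google prepends to these JSON responses.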
114 def req(url, f_req, note, errnote):
115 data = login_form.copy()
116 data.update({
117 'pstMsg': 1,
118 'checkConnection': 'youtube',
119 'checkedDomains': 'youtube',
120 'hl': 'en',
121 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
122 'f.req': json.dumps(f_req),
123 'flowName': 'GlifWebSignIn',
124 'flowEntry': 'ServiceLogin',
125 # TODO: reverse-engineer the actual botguard identifier generation algorithm
126 'bgRequest': '["identifier",""]',
127 })
128 return self._download_json(
129 url, None, note=note, errnote=errnote,
130 transform_source=lambda s: re.sub(r'^[^[]*', '', s),
131 fatal=False,
132 data=urlencode_postdata(data), headers={
133 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
134 'Google-Accounts-XSRF': 1,
135 })
136
137 def warn(message):
138 self._downloader.report_warning(message)
139
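# The payloads below mirror the undocumented JSON arrays sent by Google's
# own sign-in page; most positional fields have unknown meaning and are
# simply replayed as observed in the browser.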
140 lookup_req = [
141 username,
142 None, [], None, 'US', None, None, 2, False, True,
143 [
144 None, None,
145 [2, 1, None, 1,
146 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
147 None, [], 4],
148 1, [None, None, []], None, None, None, True
149 ],
150 username,
151 ]
152
153 lookup_results = req(
154 self._LOOKUP_URL, lookup_req,
155 'Looking up account info', 'Unable to look up account info')
156
157 if lookup_results is False:
158 return False
159
160 user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
161 if not user_hash:
162 warn('Unable to extract user hash')
163 return False
164
165 challenge_req = [
166 user_hash,
167 None, 1, None, [1, None, None, None, [password, None, True]],
168 [
169 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
170 1, [None, None, []], None, None, None, True
171 ]]
172
173 challenge_results = req(
174 self._CHALLENGE_URL, challenge_req,
175 'Logging in', 'Unable to log in')
176
177 if challenge_results is False:
178 return False
179
180 login_res = try_get(challenge_results, lambda x: x[0][5], list)
181 if login_res:
182 login_msg = try_get(login_res, lambda x: x[5], compat_str)
183 warn(
184 'Unable to log in: %s' % ('Invalid password'
185 if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
186 return False
187
188 res = try_get(challenge_results, lambda x: x[0][-1], list)
189 if not res:
190 warn('Unable to extract result entry')
191 return False
192
193 login_challenge = try_get(res, lambda x: x[0][0], list)
194 if login_challenge:
195 challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
196 if challenge_str == 'TWO_STEP_VERIFICATION':
197 # SEND_SUCCESS - TFA code has been successfully sent to phone
198 # QUOTA_EXCEEDED - reached the limit of TFA codes
199 status = try_get(login_challenge, lambda x: x[5], compat_str)
200 if status == 'QUOTA_EXCEEDED':
201 warn('Exceeded the limit of TFA codes, try later')
202 return False
203
204 tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
205 if not tl:
206 warn('Unable to extract TL')
207 return False
208
209 tfa_code = self._get_tfa_info('2-step verification code')
210
211 if not tfa_code:
212 warn(
213 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
214 ' (Note that only TOTP (Google Authenticator App) codes work at this time.)')
215 return False
216
217 tfa_code = remove_start(tfa_code, 'G-')
218
219 tfa_req = [
220 user_hash, None, 2, None,
221 [
222 9, None, None, None, None, None, None, None,
223 [None, tfa_code, True, 2]
224 ]]
225
226 tfa_results = req(
227 self._TFA_URL.format(tl), tfa_req,
228 'Submitting TFA code', 'Unable to submit TFA code')
229
230 if tfa_results is False:
231 return False
232
233 tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
234 if tfa_res:
235 tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
236 warn(
237 'Unable to finish TFA: %s' % ('Invalid TFA code'
238 if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
239 return False
240
241 check_cookie_url = try_get(
242 tfa_results, lambda x: x[0][-1][2], compat_str)
243 else:
244 CHALLENGES = {
245 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
246 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
247 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
248 }
249 challenge = CHALLENGES.get(
250 challenge_str,
251 '%s returned error %s.' % (self.IE_NAME, challenge_str))
252 warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
253 return False
254 else:
255 check_cookie_url = try_get(res, lambda x: x[2], compat_str)
256
257 if not check_cookie_url:
258 warn('Unable to extract CheckCookie URL')
259 return False
260
261 check_cookie_results = self._download_webpage(
262 check_cookie_url, None, 'Checking cookie', fatal=False)
263
264 if check_cookie_results is False:
265 return False
266
267 if 'https://myaccount.google.com/' not in check_cookie_results:
268 warn('Unable to log in')
269 return False
270
271 return True
272
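# All webpage requests go through here so that disable_polymer=true is
# always appended, keeping YouTube on the legacy (non-Polymer) layout that
# the regex-based extraction in this file expects.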
273 def _download_webpage_handle(self, *args, **kwargs):
274 query = kwargs.get('query', {}).copy()
275 query['disable_polymer'] = 'true'
276 kwargs['query'] = query
277 return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
278 *args, **compat_kwargs(kwargs))
279
280 def _real_initialize(self):
281 if self._downloader is None:
282 return
283 self._set_language()
284 if not self._login():
285 return
286
287
288 class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
289 # Extract entries from a page with a "Load more" button
290 def _entries(self, page, playlist_id):
291 more_widget_html = content_html = page
292 for page_num in itertools.count(1):
293 for entry in self._process_page(content_html):
294 yield entry
295
296 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
297 if not mobj:
298 break
299
300 count = 0
301 retries = 3
302 while count <= retries:
303 try:
304 # Downloading the page may result in an intermittent 5xx HTTP error
305 # that can usually be worked around with a retry
306 more = self._download_json(
307 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
308 'Downloading page #%s%s'
309 % (page_num, ' (retry #%d)' % count if count else ''),
310 transform_source=uppercase_escape,
311 headers=self._YOUTUBE_CLIENT_HEADERS)
312 break
313 except ExtractorError as e:
314 if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
315 count += 1
316 if count <= retries:
317 continue
318 raise
319
320 content_html = more['content_html']
321 if not content_html.strip():
322 # Some webpages show a "Load more" button even though they don't
323 # actually have any more videos
324 break
325 more_widget_html = more['load_more_widget_html']
326
327
328 class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
329 def _process_page(self, content):
330 for video_id, video_title in self.extract_videos_from_page(content):
331 yield self.url_result(video_id, 'Youtube', video_id, video_title)
332
333 def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
334 for mobj in re.finditer(video_re, page):
335 # The link with index 0 is not the first video of the playlist (not sure if this is still the case)
336 if 'index' in mobj.groupdict() and mobj.group('id') == '0':
337 continue
338 video_id = mobj.group('id')
339 video_title = unescapeHTML(
340 mobj.group('title')) if 'title' in mobj.groupdict() else None
341 if video_title:
342 video_title = video_title.strip()
343 if video_title == '► Play all':
344 video_title = None
345 try:
346 idx = ids_in_page.index(video_id)
347 if video_title and not titles_in_page[idx]:
348 titles_in_page[idx] = video_title
349 except ValueError:
350 ids_in_page.append(video_id)
351 titles_in_page.append(video_title)
352
353 def extract_videos_from_page(self, page):
354 ids_in_page = []
355 titles_in_page = []
356 self.extract_videos_from_page_impl(
357 self._VIDEO_RE, page, ids_in_page, titles_in_page)
358 return zip(ids_in_page, titles_in_page)
359
360
361 class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
362 def _process_page(self, content):
363 for playlist_id in orderedSet(re.findall(
364 r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
365 content)):
366 yield self.url_result(
367 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
368
369 def _real_extract(self, url):
370 playlist_id = self._match_id(url)
371 webpage = self._download_webpage(url, playlist_id)
372 title = self._og_search_title(webpage, fatal=False)
373 return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
374
375
376 class YoutubeIE(YoutubeBaseInfoExtractor):
377 IE_DESC = 'YouTube.com'
378 _VALID_URL = r"""(?x)^
379 (
380 (?:https?://|//) # http(s):// or protocol-independent URL
381 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
382 (?:www\.)?deturl\.com/www\.youtube\.com/|
383 (?:www\.)?pwnyoutube\.com/|
384 (?:www\.)?hooktube\.com/|
385 (?:www\.)?yourepeat\.com/|
386 tube\.majestyc\.net/|
387 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
388 (?:(?:www|dev)\.)?invidio\.us/|
389 (?:(?:www|no)\.)?invidiou\.sh/|
390 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
391 (?:www\.)?invidious\.kabi\.tk/|
392 (?:www\.)?invidious\.13ad\.de/|
393 (?:www\.)?invidious\.mastodon\.host/|
394 (?:www\.)?invidious\.nixnet\.xyz/|
395 (?:www\.)?invidious\.drycat\.fr/|
396 (?:www\.)?tube\.poal\.co/|
397 (?:www\.)?vid\.wxzm\.sx/|
398 (?:www\.)?yewtu\.be/|
399 (?:www\.)?yt\.elukerio\.org/|
400 (?:www\.)?yt\.lelux\.fi/|
401 (?:www\.)?invidious\.ggc-project\.de/|
402 (?:www\.)?yt\.maisputain\.ovh/|
403 (?:www\.)?invidious\.13ad\.de/|
404 (?:www\.)?invidious\.toot\.koeln/|
405 (?:www\.)?invidious\.fdn\.fr/|
406 (?:www\.)?watch\.nettohikari\.com/|
407 (?:www\.)?kgg2m7yk5aybusll\.onion/|
408 (?:www\.)?qklhadlycap4cnod\.onion/|
409 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
410 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
411 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
412 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
413 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
414 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
415 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
416 (?:.*?\#/)? # handle anchor (#/) redirect urls
417 (?: # the various things that can precede the ID:
418 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
419 |(?: # or the v= param in all its forms
420 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
421 (?:\?|\#!?) # the params delimiter ? or # or #!
422 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
423 v=
424 )
425 ))
426 |(?:
427 youtu\.be| # just youtu.be/xxxx
428 vid\.plus| # or vid.plus/xxxx
429 zwearz\.com/watch| # or zwearz.com/watch/xxxx
430 )/
431 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
432 )
433 )? # everything up to here is optional -> you can pass just the naked video ID
434 ([0-9A-Za-z_-]{11}) # here it is! the YouTube video ID
435 (?!.*?\blist=
436 (?:
437 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
438 WL # WL are handled by the watch later IE
439 )
440 )
441 (?(1).+)? # if we found the ID, everything can follow
442 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
443 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
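# Extracts a player id/ext from the player URL for caching purposes; e.g. a
# (hypothetical) URL ending in /5a3b6271/player_ias.vflset/en_US/base.js
# would yield id '5a3b6271' and ext 'js', while the second pattern covers
# older 'vflXXXX'-style player names.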
444 _PLAYER_INFO_RE = (
445 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
446 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
447 )
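# Hard-coded itag metadata used to supplement whatever the stream/manifest
# data itself exposes (codecs, bitrates, 3D/HLS/DASH notes, preferences).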
448 _formats = {
449 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
450 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
451 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
452 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
453 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
454 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
455 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
456 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
457 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
458 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
459 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
460 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
461 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
462 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
463 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
464 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
465 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
466 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
467
468
469 # 3D videos
470 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
471 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
472 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
473 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
474 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
475 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
476 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
477
478 # Apple HTTP Live Streaming
479 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
480 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
481 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
482 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
483 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
484 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
485 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
486 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
487
488 # DASH mp4 video
489 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
490 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
491 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
493 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
495 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
497 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
498 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
499 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
500 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
501
502 # Dash mp4 audio
503 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
504 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
505 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
506 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
507 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
508 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
509 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
510
511 # Dash webm
512 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
513 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
514 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
517 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
518 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
519 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
520 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
521 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
526 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
528 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
529 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
530 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
531 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
533 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
534
535 # Dash webm audio
536 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
537 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
538
539 # Dash webm audio with opus inside
540 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
541 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
542 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
543
544 # RTMP (unnamed)
545 '_rtmp': {'protocol': 'rtmp'},
546
547 # av01 video-only formats are sometimes served with "unknown" codecs
548 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
549 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
550 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
551 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
552 }
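# Caption formats requested from the timedtext endpoints: json3 is JSON
# events, srv1-srv3 and ttml are XML-based variants, vtt is WebVTT.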
553 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
554
555 _GEO_BYPASS = False
556
557 IE_NAME = 'youtube'
558 _TESTS = [
559 {
560 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
561 'info_dict': {
562 'id': 'BaW_jenozKc',
563 'ext': 'mp4',
564 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
565 'uploader': 'Philipp Hagemeister',
566 'uploader_id': 'phihag',
567 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
568 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
569 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
570 'upload_date': '20121002',
571 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
572 'categories': ['Science & Technology'],
573 'tags': ['youtube-dl'],
574 'duration': 10,
575 'view_count': int,
576 'like_count': int,
577 'dislike_count': int,
578 'start_time': 1,
579 'end_time': 9,
580 }
581 },
582 {
583 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
584 'note': 'Test generic use_cipher_signature video (#897)',
585 'info_dict': {
586 'id': 'UxxajLWwzqY',
587 'ext': 'mp4',
588 'upload_date': '20120506',
589 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
590 'alt_title': 'I Love It (feat. Charli XCX)',
591 'description': 'md5:19a2f98d9032b9311e686ed039564f63',
592 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
593 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
594 'iconic ep', 'iconic', 'love', 'it'],
595 'duration': 180,
596 'uploader': 'Icona Pop',
597 'uploader_id': 'IconaPop',
598 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
599 'creator': 'Icona Pop',
600 'track': 'I Love It (feat. Charli XCX)',
601 'artist': 'Icona Pop',
602 }
603 },
604 {
605 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
606 'note': 'Test VEVO video with age protection (#956)',
607 'info_dict': {
608 'id': '07FYdnEawAQ',
609 'ext': 'mp4',
610 'upload_date': '20130703',
611 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
612 'alt_title': 'Tunnel Vision',
613 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
614 'duration': 419,
615 'uploader': 'justintimberlakeVEVO',
616 'uploader_id': 'justintimberlakeVEVO',
617 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
618 'creator': 'Justin Timberlake',
619 'track': 'Tunnel Vision',
620 'artist': 'Justin Timberlake',
621 'age_limit': 18,
622 }
623 },
624 {
625 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
626 'note': 'Embed-only video (#1746)',
627 'info_dict': {
628 'id': 'yZIXLfi8CZQ',
629 'ext': 'mp4',
630 'upload_date': '20120608',
631 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
632 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
633 'uploader': 'SET India',
634 'uploader_id': 'setindia',
635 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
636 'age_limit': 18,
637 }
638 },
639 {
640 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
641 'note': 'Use the first video ID in the URL',
642 'info_dict': {
643 'id': 'BaW_jenozKc',
644 'ext': 'mp4',
645 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
646 'uploader': 'Philipp Hagemeister',
647 'uploader_id': 'phihag',
648 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
649 'upload_date': '20121002',
650 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
651 'categories': ['Science & Technology'],
652 'tags': ['youtube-dl'],
653 'duration': 10,
654 'view_count': int,
655 'like_count': int,
656 'dislike_count': int,
657 },
658 'params': {
659 'skip_download': True,
660 },
661 },
662 {
663 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
664 'note': '256k DASH audio (format 141) via DASH manifest',
665 'info_dict': {
666 'id': 'a9LDPn-MO4I',
667 'ext': 'm4a',
668 'upload_date': '20121002',
669 'uploader_id': '8KVIDEO',
670 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
671 'description': '',
672 'uploader': '8KVIDEO',
673 'title': 'UHDTV TEST 8K VIDEO.mp4'
674 },
675 'params': {
676 'youtube_include_dash_manifest': True,
677 'format': '141',
678 },
679 'skip': 'format 141 not served anymore',
680 },
681 # DASH manifest with encrypted signature
682 {
683 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
684 'info_dict': {
685 'id': 'IB3lcPjvWLA',
686 'ext': 'm4a',
687 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
688 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
689 'duration': 244,
690 'uploader': 'AfrojackVEVO',
691 'uploader_id': 'AfrojackVEVO',
692 'upload_date': '20131011',
693 },
694 'params': {
695 'youtube_include_dash_manifest': True,
696 'format': '141/bestaudio[ext=m4a]',
697 },
698 },
699 # JS player signature function name containing $
700 {
701 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
702 'info_dict': {
703 'id': 'nfWlot6h_JM',
704 'ext': 'm4a',
705 'title': 'Taylor Swift - Shake It Off',
706 'description': 'md5:307195cd21ff7fa352270fe884570ef0',
707 'duration': 242,
708 'uploader': 'TaylorSwiftVEVO',
709 'uploader_id': 'TaylorSwiftVEVO',
710 'upload_date': '20140818',
711 },
712 'params': {
713 'youtube_include_dash_manifest': True,
714 'format': '141/bestaudio[ext=m4a]',
715 },
716 },
717 # Controversy video
718 {
719 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
720 'info_dict': {
721 'id': 'T4XJQO3qol8',
722 'ext': 'mp4',
723 'duration': 219,
724 'upload_date': '20100909',
725 'uploader': 'Amazing Atheist',
726 'uploader_id': 'TheAmazingAtheist',
727 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
728 'title': 'Burning Everyone\'s Koran',
729 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
730 }
731 },
732 # Normal age-gate video (No vevo, embed allowed)
733 {
734 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
735 'info_dict': {
736 'id': 'HtVdAasjOgU',
737 'ext': 'mp4',
738 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
739 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
740 'duration': 142,
741 'uploader': 'The Witcher',
742 'uploader_id': 'WitcherGame',
743 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
744 'upload_date': '20140605',
745 'age_limit': 18,
746 },
747 },
748 # Age-gate video with encrypted signature
749 {
750 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
751 'info_dict': {
752 'id': '6kLq3WMV1nU',
753 'ext': 'mp4',
754 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
755 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
756 'duration': 246,
757 'uploader': 'LloydVEVO',
758 'uploader_id': 'LloydVEVO',
759 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
760 'upload_date': '20110629',
761 'age_limit': 18,
762 },
763 },
764 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
765 # YouTube Red ad is not captured for creator
766 {
767 'url': '__2ABJjxzNo',
768 'info_dict': {
769 'id': '__2ABJjxzNo',
770 'ext': 'mp4',
771 'duration': 266,
772 'upload_date': '20100430',
773 'uploader_id': 'deadmau5',
774 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
775 'creator': 'Dada Life, deadmau5',
776 'description': 'md5:12c56784b8032162bb936a5f76d55360',
777 'uploader': 'deadmau5',
778 'title': 'Deadmau5 - Some Chords (HD)',
779 'alt_title': 'This Machine Kills Some Chords',
780 },
781 'expected_warnings': [
782 'DASH manifest missing',
783 ]
784 },
785 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
786 {
787 'url': 'lqQg6PlCWgI',
788 'info_dict': {
789 'id': 'lqQg6PlCWgI',
790 'ext': 'mp4',
791 'duration': 6085,
792 'upload_date': '20150827',
793 'uploader_id': 'olympic',
794 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
795 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
796 'uploader': 'Olympic',
797 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
798 },
799 'params': {
800 'skip_download': 'requires avconv',
801 }
802 },
803 # Non-square pixels
804 {
805 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
806 'info_dict': {
807 'id': '_b-2C3KPAM0',
808 'ext': 'mp4',
809 'stretched_ratio': 16 / 9.,
810 'duration': 85,
811 'upload_date': '20110310',
812 'uploader_id': 'AllenMeow',
813 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
814 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
815 'uploader': '孫ᄋᄅ',
816 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
817 },
818 },
819 # url_encoded_fmt_stream_map is empty string
820 {
821 'url': 'qEJwOuvDf7I',
822 'info_dict': {
823 'id': 'qEJwOuvDf7I',
824 'ext': 'webm',
825 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
826 'description': '',
827 'upload_date': '20150404',
828 'uploader_id': 'spbelect',
829 'uploader': 'Наблюдатели Петербурга',
830 },
831 'params': {
832 'skip_download': 'requires avconv',
833 },
834 'skip': 'This live event has ended.',
835 },
836 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
837 {
838 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
839 'info_dict': {
840 'id': 'FIl7x6_3R5Y',
841 'ext': 'webm',
842 'title': 'md5:7b81415841e02ecd4313668cde88737a',
843 'description': 'md5:116377fd2963b81ec4ce64b542173306',
844 'duration': 220,
845 'upload_date': '20150625',
846 'uploader_id': 'dorappi2000',
847 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
848 'uploader': 'dorappi2000',
849 'formats': 'mincount:31',
850 },
851 'skip': 'not actual anymore',
852 },
853 # DASH manifest with segment_list
854 {
855 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
856 'md5': '8ce563a1d667b599d21064e982ab9e31',
857 'info_dict': {
858 'id': 'CsmdDsKjzN8',
859 'ext': 'mp4',
860 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
861 'uploader': 'Airtek',
862 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
863 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
864 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
865 },
866 'params': {
867 'youtube_include_dash_manifest': True,
868 'format': '135', # bestvideo
869 },
870 'skip': 'This live event has ended.',
871 },
872 {
873 # Multifeed videos (multiple cameras), URL is for Main Camera
874 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
875 'info_dict': {
876 'id': 'jqWvoWXjCVs',
877 'title': 'teamPGP: Rocket League Noob Stream',
878 'description': 'md5:dc7872fb300e143831327f1bae3af010',
879 },
880 'playlist': [{
881 'info_dict': {
882 'id': 'jqWvoWXjCVs',
883 'ext': 'mp4',
884 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
885 'description': 'md5:dc7872fb300e143831327f1bae3af010',
886 'duration': 7335,
887 'upload_date': '20150721',
888 'uploader': 'Beer Games Beer',
889 'uploader_id': 'beergamesbeer',
890 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
891 'license': 'Standard YouTube License',
892 },
893 }, {
894 'info_dict': {
895 'id': '6h8e8xoXJzg',
896 'ext': 'mp4',
897 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
898 'description': 'md5:dc7872fb300e143831327f1bae3af010',
899 'duration': 7337,
900 'upload_date': '20150721',
901 'uploader': 'Beer Games Beer',
902 'uploader_id': 'beergamesbeer',
903 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
904 'license': 'Standard YouTube License',
905 },
906 }, {
907 'info_dict': {
908 'id': 'PUOgX5z9xZw',
909 'ext': 'mp4',
910 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
911 'description': 'md5:dc7872fb300e143831327f1bae3af010',
912 'duration': 7337,
913 'upload_date': '20150721',
914 'uploader': 'Beer Games Beer',
915 'uploader_id': 'beergamesbeer',
916 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
917 'license': 'Standard YouTube License',
918 },
919 }, {
920 'info_dict': {
921 'id': 'teuwxikvS5k',
922 'ext': 'mp4',
923 'title': 'teamPGP: Rocket League Noob Stream (zim)',
924 'description': 'md5:dc7872fb300e143831327f1bae3af010',
925 'duration': 7334,
926 'upload_date': '20150721',
927 'uploader': 'Beer Games Beer',
928 'uploader_id': 'beergamesbeer',
929 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
930 'license': 'Standard YouTube License',
931 },
932 }],
933 'params': {
934 'skip_download': True,
935 },
936 'skip': 'This video is not available.',
937 },
938 {
939 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
940 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
941 'info_dict': {
942 'id': 'gVfLd0zydlo',
943 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
944 },
945 'playlist_count': 2,
946 'skip': 'Not multifeed anymore',
947 },
948 {
949 'url': 'https://vid.plus/FlRa-iH7PGw',
950 'only_matching': True,
951 },
952 {
953 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
954 'only_matching': True,
955 },
956 {
957 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
958 # Also tests cut-off URL expansion in video description (see
959 # https://github.com/ytdl-org/youtube-dl/issues/1892,
960 # https://github.com/ytdl-org/youtube-dl/issues/8164)
961 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
962 'info_dict': {
963 'id': 'lsguqyKfVQg',
964 'ext': 'mp4',
965 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
966 'alt_title': 'Dark Walk - Position Music',
967 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
968 'duration': 133,
969 'upload_date': '20151119',
970 'uploader_id': 'IronSoulElf',
971 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
972 'uploader': 'IronSoulElf',
973 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
974 'track': 'Dark Walk - Position Music',
975 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
976 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
977 },
978 'params': {
979 'skip_download': True,
980 },
981 },
982 {
983 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
984 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
985 'only_matching': True,
986 },
987 {
988 # Video with yt:stretch=17:0
989 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
990 'info_dict': {
991 'id': 'Q39EVAstoRM',
992 'ext': 'mp4',
993 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
994 'description': 'md5:ee18a25c350637c8faff806845bddee9',
995 'upload_date': '20151107',
996 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
997 'uploader': 'CH GAMER DROID',
998 },
999 'params': {
1000 'skip_download': True,
1001 },
1002 'skip': 'This video does not exist.',
1003 },
1004 {
1005 # Video licensed under Creative Commons
1006 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1007 'info_dict': {
1008 'id': 'M4gD1WSo5mA',
1009 'ext': 'mp4',
1010 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1011 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1012 'duration': 721,
1013 'upload_date': '20150127',
1014 'uploader_id': 'BerkmanCenter',
1015 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1016 'uploader': 'The Berkman Klein Center for Internet & Society',
1017 'license': 'Creative Commons Attribution license (reuse allowed)',
1018 },
1019 'params': {
1020 'skip_download': True,
1021 },
1022 },
1023 {
1024 # Channel-like uploader_url
1025 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1026 'info_dict': {
1027 'id': 'eQcmzGIKrzg',
1028 'ext': 'mp4',
1029 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1030 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
1031 'duration': 4060,
1032 'upload_date': '20151119',
1033 'uploader': 'Bernie Sanders',
1034 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1036 'license': 'Creative Commons Attribution license (reuse allowed)',
1037 },
1038 'params': {
1039 'skip_download': True,
1040 },
1041 },
1042 {
1043 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
1044 'only_matching': True,
1045 },
1046 {
1047 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1048 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1049 'only_matching': True,
1050 },
1051 {
1052 # Rental video preview
1053 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1054 'info_dict': {
1055 'id': 'uGpuVWrhIzE',
1056 'ext': 'mp4',
1057 'title': 'Piku - Trailer',
1058 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1059 'upload_date': '20150811',
1060 'uploader': 'FlixMatrix',
1061 'uploader_id': 'FlixMatrixKaravan',
1062 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1063 'license': 'Standard YouTube License',
1064 },
1065 'params': {
1066 'skip_download': True,
1067 },
1068 'skip': 'This video is not available.',
1069 },
1070 {
1071 # YouTube Red video with episode data
1072 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1073 'info_dict': {
1074 'id': 'iqKdEhx-dD4',
1075 'ext': 'mp4',
1076 'title': 'Isolation - Mind Field (Ep 1)',
1077 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1078 'duration': 2085,
1079 'upload_date': '20170118',
1080 'uploader': 'Vsauce',
1081 'uploader_id': 'Vsauce',
1082 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1083 'series': 'Mind Field',
1084 'season_number': 1,
1085 'episode_number': 1,
1086 },
1087 'params': {
1088 'skip_download': True,
1089 },
1090 'expected_warnings': [
1091 'Skipping DASH manifest',
1092 ],
1093 },
1094 {
1095 # The following content has been identified by the YouTube community
1096 # as inappropriate or offensive to some audiences.
1097 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1098 'info_dict': {
1099 'id': '6SJNVb0GnPI',
1100 'ext': 'mp4',
1101 'title': 'Race Differences in Intelligence',
1102 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1103 'duration': 965,
1104 'upload_date': '20140124',
1105 'uploader': 'New Century Foundation',
1106 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1107 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1108 },
1109 'params': {
1110 'skip_download': True,
1111 },
1112 },
1113 {
1114 # itag 212
1115 'url': '1t24XAntNCY',
1116 'only_matching': True,
1117 },
1118 {
1119 # geo restricted to JP
1120 'url': 'sJL6WA-aGkQ',
1121 'only_matching': True,
1122 },
1123 {
1124 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1125 'only_matching': True,
1126 },
1127 {
1128 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1129 'only_matching': True,
1130 },
1131 {
1132 # DRM protected
1133 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1134 'only_matching': True,
1135 },
1136 {
1137 # Video with unsupported adaptive stream type formats
1138 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1139 'info_dict': {
1140 'id': 'Z4Vy8R84T1U',
1141 'ext': 'mp4',
1142 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1143 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1144 'duration': 433,
1145 'upload_date': '20130923',
1146 'uploader': 'Amelia Putri Harwita',
1147 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1148 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1149 'formats': 'maxcount:10',
1150 },
1151 'params': {
1152 'skip_download': True,
1153 'youtube_include_dash_manifest': False,
1154 },
1155 'skip': 'not actual anymore',
1156 },
1157 {
1158 # Youtube Music Auto-generated description
1159 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1160 'info_dict': {
1161 'id': 'MgNrAu2pzNs',
1162 'ext': 'mp4',
1163 'title': 'Voyeur Girl',
1164 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1165 'upload_date': '20190312',
1166 'uploader': 'Stephen - Topic',
1167 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1168 'artist': 'Stephen',
1169 'track': 'Voyeur Girl',
1170 'album': 'it\'s too much love to know my dear',
1171 'release_date': '20190313',
1172 'release_year': 2019,
1173 },
1174 'params': {
1175 'skip_download': True,
1176 },
1177 },
1178 {
1179 # Youtube Music Auto-generated description
1180 # Retrieve 'artist' field from 'Artist:' in video description
1181 # when it is present on youtube music video
1182 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1183 'info_dict': {
1184 'id': 'k0jLE7tTwjY',
1185 'ext': 'mp4',
1186 'title': 'Latch Feat. Sam Smith',
1187 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1188 'upload_date': '20150110',
1189 'uploader': 'Various Artists - Topic',
1190 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1191 'artist': 'Disclosure',
1192 'track': 'Latch Feat. Sam Smith',
1193 'album': 'Latch Featuring Sam Smith',
1194 'release_date': '20121008',
1195 'release_year': 2012,
1196 },
1197 'params': {
1198 'skip_download': True,
1199 },
1200 },
1201 {
1202 # Youtube Music Auto-generated description
1203 # handle multiple artists on youtube music video
1204 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1205 'info_dict': {
1206 'id': '74qn0eJSjpA',
1207 'ext': 'mp4',
1208 'title': 'Eastside',
1209 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1210 'upload_date': '20180710',
1211 'uploader': 'Benny Blanco - Topic',
1212 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1213 'artist': 'benny blanco, Halsey, Khalid',
1214 'track': 'Eastside',
1215 'album': 'Eastside',
1216 'release_date': '20180713',
1217 'release_year': 2018,
1218 },
1219 'params': {
1220 'skip_download': True,
1221 },
1222 },
1223 {
1224 # Youtube Music Auto-generated description
1225 # handle youtube music video with release_year and no release_date
1226 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1227 'info_dict': {
1228 'id': '-hcAI0g-f5M',
1229 'ext': 'mp4',
1230 'title': 'Put It On Me',
1231 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1232 'upload_date': '20180426',
1233 'uploader': 'Matt Maeson - Topic',
1234 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1235 'artist': 'Matt Maeson',
1236 'track': 'Put It On Me',
1237 'album': 'The Hearse',
1238 'release_date': None,
1239 'release_year': 2018,
1240 },
1241 'params': {
1242 'skip_download': True,
1243 },
1244 },
1245 {
1246 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1247 'only_matching': True,
1248 },
1249 {
1250 # invalid -> valid video id redirection
1251 'url': 'DJztXj2GPfl',
1252 'info_dict': {
1253 'id': 'DJztXj2GPfk',
1254 'ext': 'mp4',
1255 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1256 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1257 'upload_date': '20090125',
1258 'uploader': 'Prochorowka',
1259 'uploader_id': 'Prochorowka',
1260 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1261 'artist': 'Panjabi MC',
1262 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1263 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1264 },
1265 'params': {
1266 'skip_download': True,
1267 },
1268 },
1269 {
1270 # empty description results in an empty string
1271 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1272 'info_dict': {
1273 'id': 'x41yOUIvK2k',
1274 'ext': 'mp4',
1275 'title': 'IMG 3456',
1276 'description': '',
1277 'upload_date': '20170613',
1278 'uploader_id': 'ElevageOrVert',
1279 'uploader': 'ElevageOrVert',
1280 },
1281 'params': {
1282 'skip_download': True,
1283 },
1284 },
1285 ]
1286
1287 def __init__(self, *args, **kwargs):
1288 super(YoutubeIE, self).__init__(*args, **kwargs)
1289 self._player_cache = {}
1290
1291 def report_video_info_webpage_download(self, video_id):
1292 """Report attempt to download video info webpage."""
1293 self.to_screen('%s: Downloading video info webpage' % video_id)
1294
1295 def report_information_extraction(self, video_id):
1296 """Report attempt to extract video information."""
1297 self.to_screen('%s: Extracting video information' % video_id)
1298
1299 def report_unavailable_format(self, video_id, format):
1300 """Report extracted video URL."""
1301 self.to_screen('%s: Format %s not available' % (video_id, format))
1302
1303 def report_rtmp_download(self):
1304 """Indicate the download will use the RTMP protocol."""
1305 self.to_screen('RTMP download detected')
1306
1307 def _signature_cache_id(self, example_sig):
1308 """ Return a string representation of a signature """
1309 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1310
1311 @classmethod
1312 def _extract_player_info(cls, player_url):
1313 for player_re in cls._PLAYER_INFO_RE:
1314 id_m = re.search(player_re, player_url)
1315 if id_m:
1316 break
1317 else:
1318 raise ExtractorError('Cannot identify player %r' % player_url)
1319 return id_m.group('ext'), id_m.group('id')
1320
1321 def _extract_signature_function(self, video_id, player_url, example_sig):
1322 player_type, player_id = self._extract_player_info(player_url)
1323
1324 # Read from filesystem cache
1325 func_id = '%s_%s_%s' % (
1326 player_type, player_id, self._signature_cache_id(example_sig))
1327 assert os.path.basename(func_id) == func_id
1328
1329 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
1330 if cache_spec is not None:
1331 return lambda s: ''.join(s[i] for i in cache_spec)
1332
1333 download_note = (
1334 'Downloading player %s' % player_url
1335 if self._downloader.params.get('verbose') else
1336 'Downloading %s player %s' % (player_type, player_id)
1337 )
1338 if player_type == 'js':
1339 code = self._download_webpage(
1340 player_url, video_id,
1341 note=download_note,
1342 errnote='Download of %s failed' % player_url)
1343 res = self._parse_sig_js(code)
1344 elif player_type == 'swf':
1345 urlh = self._request_webpage(
1346 player_url, video_id,
1347 note=download_note,
1348 errnote='Download of %s failed' % player_url)
1349 code = urlh.read()
1350 res = self._parse_sig_swf(code)
1351 else:
1352 assert False, 'Invalid player type %r' % player_type
1353
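# Run the deciphering function over a string of unique characters
# (chr(0), chr(1), ...) the same length as the example signature; the
# ordinals of the output characters then describe the transformation as a
# plain index mapping that can be cached and replayed cheaply.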
1354 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1355 cache_res = res(test_string)
1356 cache_spec = [ord(c) for c in cache_res]
1357
1358 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
1359 return res
1360
1361 def _print_sig_code(self, func, example_sig):
1362 def gen_sig_code(idxs):
1363 def _genslice(start, end, step):
1364 starts = '' if start == 0 else str(start)
1365 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
1366 steps = '' if step == 1 else (':%d' % step)
1367 return 's[%s%s%s]' % (starts, ends, steps)
1368
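# Walk the index list and collapse runs with a constant step of +/-1
# into slice expressions, falling back to single s[i] lookups otherwise.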
1369 step = None
1370 # Quell pyflakes warnings - start will be set when step is set
1371 start = '(Never used)'
1372 for i, prev in zip(idxs[1:], idxs[:-1]):
1373 if step is not None:
1374 if i - prev == step:
1375 continue
1376 yield _genslice(start, prev, step)
1377 step = None
1378 continue
1379 if i - prev in [-1, 1]:
1380 step = i - prev
1381 start = prev
1382 continue
1383 else:
1384 yield 's[%d]' % prev
1385 if step is None:
1386 yield 's[%d]' % i
1387 else:
1388 yield _genslice(start, i, step)
1389
1390 test_string = ''.join(map(compat_chr, range(len(example_sig))))
1391 cache_res = func(test_string)
1392 cache_spec = [ord(c) for c in cache_res]
1393 expr_code = ' + '.join(gen_sig_code(cache_spec))
1394 signature_id_tuple = '(%s)' % (
1395 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
1396 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
1397 ' return %s\n') % (signature_id_tuple, expr_code)
1398 self.to_screen('Extracted signature function:\n' + code)
1399
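# Locate the name of the signature-deciphering function in the player JS
# (several historical call-site patterns are tried) and hand it to the
# bundled JS interpreter.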
1400 def _parse_sig_js(self, jscode):
1401 funcname = self._search_regex(
1402 (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1403 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1404 r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1405 r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
1406 # Obsolete patterns
1407 r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1408 r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
1409 r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1410 r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1411 r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1412 r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1413 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
1414 r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
1415 jscode, 'Initial JS player signature function name', group='sig')
1416
1417 jsi = JSInterpreter(jscode)
1418 initial_function = jsi.extract_function(funcname)
1419 return lambda s: initial_function([s])
1420
1421 def _parse_sig_swf(self, file_contents):
1422 swfi = SWFInterpreter(file_contents)
1423 TARGET_CLASSNAME = 'SignatureDecipher'
1424 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1425 initial_function = swfi.extract_function(searched_class, 'decipher')
1426 return lambda s: initial_function([s])
1427
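# Deciphered signature functions are cached per (player URL, signature
# layout) pair, so each player only has to be downloaded and parsed once.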
1428 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1429 """Turn the encrypted s field into a working signature"""
1430
1431 if player_url is None:
1432 raise ExtractorError('Cannot decrypt signature without player_url')
1433
1434 if player_url.startswith('//'):
1435 player_url = 'https:' + player_url
1436 elif not re.match(r'https?://', player_url):
1437 player_url = compat_urlparse.urljoin(
1438 'https://www.youtube.com', player_url)
1439 try:
1440 player_id = (player_url, self._signature_cache_id(s))
1441 if player_id not in self._player_cache:
1442 func = self._extract_signature_function(
1443 video_id, player_url, s
1444 )
1445 self._player_cache[player_id] = func
1446 func = self._player_cache[player_id]
1447 if self._downloader.params.get('youtube_print_sig_code'):
1448 self._print_sig_code(func, s)
1449 return func(s)
1450 except Exception as e:
1451 tb = traceback.format_exc()
1452 raise ExtractorError(
1453 'Signature extraction failed: ' + tb, cause=e)
1454
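# Subtitle listing still uses the legacy timedtext 'list' API; when a live
# chat replay is available, a pseudo 'live_chat' track is added so it can
# be fetched via the youtube_live_chat_replay protocol.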
1455 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1456 try:
1457 subs_doc = self._download_xml(
1458 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1459 video_id, note=False)
1460 except ExtractorError as err:
1461 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1462 return {}
1463
1464 sub_lang_list = {}
1465 for track in subs_doc.findall('track'):
1466 lang = track.attrib['lang_code']
1467 if lang in sub_lang_list:
1468 continue
1469 sub_formats = []
1470 for ext in self._SUBTITLE_FORMATS:
1471 params = compat_urllib_parse_urlencode({
1472 'lang': lang,
1473 'v': video_id,
1474 'fmt': ext,
1475 'name': track.attrib['name'].encode('utf-8'),
1476 })
1477 sub_formats.append({
1478 'url': 'https://www.youtube.com/api/timedtext?' + params,
1479 'ext': ext,
1480 })
1481 sub_lang_list[lang] = sub_formats
1482 if has_live_chat_replay:
1483 sub_lang_list['live_chat'] = [
1484 {
1485 'video_id': video_id,
1486 'ext': 'json',
1487 'protocol': 'youtube_live_chat_replay',
1488 },
1489 ]
1490 if not sub_lang_list:
1491 self._downloader.report_warning('video doesn\'t have subtitles')
1492 return {}
1493 return sub_lang_list
1494
1495 def _get_ytplayer_config(self, video_id, webpage):
1496 patterns = (
1497 # User data may contain arbitrary character sequences that can break
1498 # JSON extraction with a regex, e.g. when '};' appears in a value the second
1499 # regex won't capture the whole JSON. Work around this by trying the more
1500 # specific regex first; proper quoted-string handling, to be implemented
1501 # in the future, will eventually replace this workaround (see
1502 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1503 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1504 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1505 r';ytplayer\.config\s*=\s*({.+?});',
1506 )
1507 config = self._search_regex(
1508 patterns, webpage, 'ytplayer.config', default=None)
1509 if config:
1510 return self._parse_json(
1511 uppercase_escape(config), video_id, fatal=False)
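# Illustrative (hypothetical) snippet the patterns above are meant to match:
#   ;ytplayer.config = {"args": {"player_response": "{...}"}, "assets": {"js": "/yts/jsbin/player-.../base.js"}};ytplayer.load(...);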
1512
1513 def _get_yt_initial_data(self, video_id, webpage):
1514 config = self._search_regex(
1515 (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
1516 r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
1517 webpage, 'ytInitialData', default=None)
1518 if config:
1519 return self._parse_json(
1520 uppercase_escape(config), video_id, fatal=False)
1521
1522 def _get_automatic_captions(self, video_id, webpage):
1523 """We need the webpage for getting the captions url, pass it as an
1524 argument to speed up the process."""
1525 self.to_screen('%s: Looking for automatic captions' % video_id)
1526 player_config = self._get_ytplayer_config(video_id, webpage)
1527 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
1528 if not player_config:
1529 self._downloader.report_warning(err_msg)
1530 return {}
1531 try:
1532 args = player_config['args']
1533 caption_url = args.get('ttsurl')
1534 if caption_url:
1535 timestamp = args['timestamp']
1536 # We get the available subtitles
1537 list_params = compat_urllib_parse_urlencode({
1538 'type': 'list',
1539 'tlangs': 1,
1540 'asrs': 1,
1541 })
1542 list_url = caption_url + '&' + list_params
1543 caption_list = self._download_xml(list_url, video_id)
1544 original_lang_node = caption_list.find('track')
1545 if original_lang_node is None:
1546 self._downloader.report_warning('Video doesn\'t have automatic captions')
1547 return {}
1548 original_lang = original_lang_node.attrib['lang_code']
1549 caption_kind = original_lang_node.attrib.get('kind', '')
1550
1551 sub_lang_list = {}
1552 for lang_node in caption_list.findall('target'):
1553 sub_lang = lang_node.attrib['lang_code']
1554 sub_formats = []
1555 for ext in self._SUBTITLE_FORMATS:
1556 params = compat_urllib_parse_urlencode({
1557 'lang': original_lang,
1558 'tlang': sub_lang,
1559 'fmt': ext,
1560 'ts': timestamp,
1561 'kind': caption_kind,
1562 })
1563 sub_formats.append({
1564 'url': caption_url + '&' + params,
1565 'ext': ext,
1566 })
1567 sub_lang_list[sub_lang] = sub_formats
1568 return sub_lang_list
1569
1570 def make_captions(sub_url, sub_langs):
1571 parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
1572 caption_qs = compat_parse_qs(parsed_sub_url.query)
1573 captions = {}
1574 for sub_lang in sub_langs:
1575 sub_formats = []
1576 for ext in self._SUBTITLE_FORMATS:
1577 caption_qs.update({
1578 'tlang': [sub_lang],
1579 'fmt': [ext],
1580 })
1581 sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
1582 query=compat_urllib_parse_urlencode(caption_qs, True)))
1583 sub_formats.append({
1584 'url': sub_url,
1585 'ext': ext,
1586 })
1587 captions[sub_lang] = sub_formats
1588 return captions
1589
1590 # New captions format as of 22.06.2017
1591 player_response = args.get('player_response')
1592 if player_response and isinstance(player_response, compat_str):
1593 player_response = self._parse_json(
1594 player_response, video_id, fatal=False)
1595 if player_response:
1596 renderer = player_response['captions']['playerCaptionsTracklistRenderer']
1597 caption_tracks = renderer['captionTracks']
1598 for caption_track in caption_tracks:
1599 if 'kind' not in caption_track:
1600 # not an automatic transcription
1601 continue
1602 base_url = caption_track['baseUrl']
1603 sub_lang_list = []
1604 for lang in renderer['translationLanguages']:
1605 lang_code = lang.get('languageCode')
1606 if lang_code:
1607 sub_lang_list.append(lang_code)
1608 return make_captions(base_url, sub_lang_list)
1609
1610 self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
1611 return {}
1612 # Some videos don't provide ttsurl but rather caption_tracks and
1613 # caption_translation_languages (e.g. 20LmZk1hakA)
1614 # No longer used as of 22.06.2017
1615 caption_tracks = args['caption_tracks']
1616 caption_translation_languages = args['caption_translation_languages']
1617 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
1618 sub_lang_list = []
1619 for lang in caption_translation_languages.split(','):
1620 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1621 sub_lang = lang_qs.get('lc', [None])[0]
1622 if sub_lang:
1623 sub_lang_list.append(sub_lang)
1624 return make_captions(caption_url, sub_lang_list)
1625 # An extractor error can be raised by the download process if there are
1626 # no automatic captions but there are subtitles
1627 except (KeyError, IndexError, ExtractorError):
1628 self._downloader.report_warning(err_msg)
1629 return {}
1630
1631 def _mark_watched(self, video_id, video_info, player_response):
1632 playback_url = url_or_none(try_get(
1633 player_response,
1634 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1635 video_info, lambda x: x['videostats_playback_base_url'][0]))
1636 if not playback_url:
1637 return
1638 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1639 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1640
1641 # The cpn generation algorithm is reverse-engineered from base.js;
1642 # in fact it works even with a dummy cpn.
1643 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1644 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
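# e.g. (illustrative) cpn could come out as 'aB3dEfGhIjKlMnOp': 16 characters drawn from
# the 64-character alphabet above; per the note above, even a dummy value is accepted.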
1645
1646 qs.update({
1647 'ver': ['2'],
1648 'cpn': [cpn],
1649 })
1650 playback_url = compat_urlparse.urlunparse(
1651 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1652
1653 self._download_webpage(
1654 playback_url, video_id, 'Marking watched',
1655 'Unable to mark watched', fatal=False)
1656
1657 @staticmethod
1658 def _extract_urls(webpage):
1659 # Embedded YouTube player
1660 entries = [
1661 unescapeHTML(mobj.group('url'))
1662 for mobj in re.finditer(r'''(?x)
1663 (?:
1664 <iframe[^>]+?src=|
1665 data-video-url=|
1666 <embed[^>]+?src=|
1667 embedSWF\(?:\s*|
1668 <object[^>]+data=|
1669 new\s+SWFObject\(
1670 )
1671 (["\'])
1672 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1673 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1674 \1''', webpage)]
1675
1676 # lazyYT YouTube embed
1677 entries.extend(list(map(
1678 unescapeHTML,
1679 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1680
1681 # Wordpress "YouTube Video Importer" plugin
1682 matches = re.findall(r'''(?x)<div[^>]+
1683 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1684 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1685 entries.extend(m[-1] for m in matches)
1686
1687 return entries
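# Illustrative embed markup matched by the patterns above (hypothetical pages/ids):
#   <iframe src="https://www.youtube.com/embed/XXXXXXXXXXX"></iframe>
#   <div class="lazyYT" data-youtube-id="XXXXXXXXXXX"></div>
#   <div class="yvii_single_video_player" data-video_id="XXXXXXXXXXX"></div>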
1688
1689 @staticmethod
1690 def _extract_url(webpage):
1691 urls = YoutubeIE._extract_urls(webpage)
1692 return urls[0] if urls else None
1693
1694 @classmethod
1695 def extract_id(cls, url):
1696 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1697 if mobj is None:
1698 raise ExtractorError('Invalid URL: %s' % url)
1699 video_id = mobj.group(2)
1700 return video_id
1701
1702 def _extract_chapters_from_json(self, webpage, video_id, duration):
1703 if not webpage:
1704 return
1705 initial_data = self._parse_json(
1706 self._search_regex(
1707 r'window\["ytInitialData"\] = (.+);\n', webpage,
1708 'player args', default='{}'),
1709 video_id, fatal=False)
1710 if not initial_data or not isinstance(initial_data, dict):
1711 return
1712 chapters_list = try_get(
1713 initial_data,
1714 lambda x: x['playerOverlays']
1715 ['playerOverlayRenderer']
1716 ['decoratedPlayerBarRenderer']
1717 ['decoratedPlayerBarRenderer']
1718 ['playerBar']
1719 ['chapteredPlayerBarRenderer']
1720 ['chapters'],
1721 list)
1722 if not chapters_list:
1723 return
1724
1725 def chapter_time(chapter):
1726 return float_or_none(
1727 try_get(
1728 chapter,
1729 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1730 int),
1731 scale=1000)
1732 chapters = []
1733 for next_num, chapter in enumerate(chapters_list, start=1):
1734 start_time = chapter_time(chapter)
1735 if start_time is None:
1736 continue
1737 end_time = (chapter_time(chapters_list[next_num])
1738 if next_num < len(chapters_list) else duration)
1739 if end_time is None:
1740 continue
1741 title = try_get(
1742 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1743 compat_str)
1744 chapters.append({
1745 'start_time': start_time,
1746 'end_time': end_time,
1747 'title': title,
1748 })
1749 return chapters
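# Illustrative chapter entry built above (hypothetical values):
#   {'start_time': 0.0, 'end_time': 95.0, 'title': 'Intro'}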
1750
1751 @staticmethod
1752 def _extract_chapters_from_description(description, duration):
1753 if not description:
1754 return None
1755 chapter_lines = re.findall(
1756 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1757 description)
1758 if not chapter_lines:
1759 return None
1760 chapters = []
1761 for next_num, (chapter_line, time_point) in enumerate(
1762 chapter_lines, start=1):
1763 start_time = parse_duration(time_point)
1764 if start_time is None:
1765 continue
1766 if start_time > duration:
1767 break
1768 end_time = (duration if next_num == len(chapter_lines)
1769 else parse_duration(chapter_lines[next_num][1]))
1770 if end_time is None:
1771 continue
1772 if end_time > duration:
1773 end_time = duration
1774 if start_time > end_time:
1775 break
1776 chapter_title = re.sub(
1777 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1778 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1779 chapters.append({
1780 'start_time': start_time,
1781 'end_time': end_time,
1782 'title': chapter_title,
1783 })
1784 return chapters
1785
1786 def _extract_chapters(self, webpage, description, video_id, duration):
1787 return (self._extract_chapters_from_json(webpage, video_id, duration)
1788 or self._extract_chapters_from_description(description, duration))
1789
1790 def _real_extract(self, url):
1791 url, smuggled_data = unsmuggle_url(url, {})
1792
1793 proto = (
1794 'http' if self._downloader.params.get('prefer_insecure', False)
1795 else 'https')
1796
1797 start_time = None
1798 end_time = None
1799 parsed_url = compat_urllib_parse_urlparse(url)
1800 for component in [parsed_url.fragment, parsed_url.query]:
1801 query = compat_parse_qs(component)
1802 if start_time is None and 't' in query:
1803 start_time = parse_duration(query['t'][0])
1804 if start_time is None and 'start' in query:
1805 start_time = parse_duration(query['start'][0])
1806 if end_time is None and 'end' in query:
1807 end_time = parse_duration(query['end'][0])
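# e.g. (illustrative) for a URL ending in '#t=1m30s' or '?start=90' the loop above yields
# start_time == 90.0, and an additional '&end=120' parameter yields end_time == 120.0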
1808
1809 # Extract the original video URL from a redirecting URL (e.g. age verification) using the next_url parameter
1810 mobj = re.search(self._NEXT_URL_RE, url)
1811 if mobj:
1812 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1813 video_id = self.extract_id(url)
1814
1815 # Get video webpage
1816 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1817 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1818
1819 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1820 video_id = qs.get('v', [None])[0] or video_id
1821
1822 # Attempt to extract SWF player URL
1823 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1824 if mobj is not None:
1825 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1826 else:
1827 player_url = None
1828
1829 dash_mpds = []
1830
1831 def add_dash_mpd(video_info):
1832 dash_mpd = video_info.get('dashmpd')
1833 if dash_mpd and dash_mpd[0] not in dash_mpds:
1834 dash_mpds.append(dash_mpd[0])
1835
1836 def add_dash_mpd_pr(pl_response):
1837 dash_mpd = url_or_none(try_get(
1838 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1839 compat_str))
1840 if dash_mpd and dash_mpd not in dash_mpds:
1841 dash_mpds.append(dash_mpd)
1842
1843 is_live = None
1844 view_count = None
1845
1846 def extract_view_count(v_info):
1847 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1848
1849 def extract_player_response(player_response, video_id):
1850 pl_response = str_or_none(player_response)
1851 if not pl_response:
1852 return
1853 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1854 if isinstance(pl_response, dict):
1855 add_dash_mpd_pr(pl_response)
1856 return pl_response
1857
1858 player_response = {}
1859
1860 # Get video info
1861 video_info = {}
1862 embed_webpage = None
1863 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1864 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1865 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1866 age_gate = True
1867 # We simulate access to the video via www.youtube.com/v/{video_id};
1868 # this page can be viewed without logging in to Youtube
1869 url = proto + '://www.youtube.com/embed/%s' % video_id
1870 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1871 # Check whether the video is only playable on Youtube - if so, it requires auth (cookies)
1872 if re.search(r'player-unavailable">', embed_webpage) is not None:
1873 '''
1874 # TODO: apply this patch once support for Python 2.6 is dropped (set literals require Python 2.7+)
1875 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1876 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1877 '''
1878 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1879 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1880 age_gate = False
1881 # Try looking directly into the video webpage
1882 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1883 if ytplayer_config:
1884 args = ytplayer_config['args']
1885 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1886 # Convert to the same format returned by compat_parse_qs
1887 video_info = dict((k, [v]) for k, v in args.items())
1888 add_dash_mpd(video_info)
1889 # A rental video that is not rented may still have a preview available (e.g.
1890 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1891 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1892 if not video_info and args.get('ypc_vid'):
1893 return self.url_result(
1894 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1895 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1896 is_live = True
1897 if not player_response:
1898 player_response = extract_player_response(args.get('player_response'), video_id)
1899 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1900 add_dash_mpd_pr(player_response)
1901 else:
1902 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1903 else:
1904 data = compat_urllib_parse_urlencode({
1905 'video_id': video_id,
1906 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1907 'sts': self._search_regex(
1908 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1909 })
1910 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1911 try:
1912 video_info_webpage = self._download_webpage(
1913 video_info_url, video_id,
1914 note='Refetching age-gated info webpage',
1915 errnote='unable to download video info webpage')
1916 except ExtractorError:
1917 video_info_webpage = None
1918 if video_info_webpage:
1919 video_info = compat_parse_qs(video_info_webpage)
1920 pl_response = video_info.get('player_response', [None])[0]
1921 player_response = extract_player_response(pl_response, video_id)
1922 add_dash_mpd(video_info)
1923 view_count = extract_view_count(video_info)
1924 else:
1925 age_gate = False
1926 # Try looking directly into the video webpage
1927 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1928 if ytplayer_config:
1929 args = ytplayer_config['args']
1930 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1931 # Convert to the same format returned by compat_parse_qs
1932 video_info = dict((k, [v]) for k, v in args.items())
1933 add_dash_mpd(video_info)
1934 # A rental video that is not rented may still have a preview available (e.g.
1935 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1936 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1937 if not video_info and args.get('ypc_vid'):
1938 return self.url_result(
1939 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1940 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1941 is_live = True
1942 if not player_response:
1943 player_response = extract_player_response(args.get('player_response'), video_id)
1944 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1945 add_dash_mpd_pr(player_response)
1946
1947 def extract_unavailable_message():
1948 messages = []
1949 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1950 msg = self._html_search_regex(
1951 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1952 video_webpage, 'unavailable %s' % kind, default=None)
1953 if msg:
1954 messages.append(msg)
1955 if messages:
1956 return '\n'.join(messages)
1957
1958 if not video_info and not player_response:
1959 unavailable_message = extract_unavailable_message()
1960 if not unavailable_message:
1961 unavailable_message = 'Unable to extract video data'
1962 raise ExtractorError(
1963 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1964
1965 if not isinstance(video_info, dict):
1966 video_info = {}
1967
1968 video_details = try_get(
1969 player_response, lambda x: x['videoDetails'], dict) or {}
1970
1971 microformat = try_get(
1972 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1973
1974 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1975 if not video_title:
1976 self._downloader.report_warning('Unable to extract video title')
1977 video_title = '_'
1978
1979 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1980 if video_description:
1981
1982 def replace_url(m):
1983 redir_url = compat_urlparse.urljoin(url, m.group(1))
1984 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1985 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1986 qs = compat_parse_qs(parsed_redir_url.query)
1987 q = qs.get('q')
1988 if q and q[0]:
1989 return q[0]
1990 return redir_url
1991
1992 description_original = video_description = re.sub(r'''(?x)
1993 <a\s+
1994 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1995 (?:title|href)="([^"]+)"\s+
1996 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1997 class="[^"]*"[^>]*>
1998 [^<]+\.{3}\s*
1999 </a>
2000 ''', replace_url, video_description)
2001 video_description = clean_html(video_description)
2002 else:
2003 video_description = video_details.get('shortDescription')
2004 if video_description is None:
2005 video_description = self._html_search_meta('description', video_webpage)
2006
2007 if not smuggled_data.get('force_singlefeed', False):
2008 if not self._downloader.params.get('noplaylist'):
2009 multifeed_metadata_list = try_get(
2010 player_response,
2011 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2012 compat_str) or try_get(
2013 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2014 if multifeed_metadata_list:
2015 entries = []
2016 feed_ids = []
2017 for feed in multifeed_metadata_list.split(','):
2018 # Unquote should take place before split on comma (,) since textual
2019 # fields may contain comma as well (see
2020 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2021 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
2022
2023 def feed_entry(name):
2024 return try_get(feed_data, lambda x: x[name][0], compat_str)
2025
2026 feed_id = feed_entry('id')
2027 if not feed_id:
2028 continue
2029 feed_title = feed_entry('title')
2030 title = video_title
2031 if feed_title:
2032 title += ' (%s)' % feed_title
2033 entries.append({
2034 '_type': 'url_transparent',
2035 'ie_key': 'Youtube',
2036 'url': smuggle_url(
2037 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2038 {'force_singlefeed': True}),
2039 'title': title,
2040 })
2041 feed_ids.append(feed_id)
2042 self.to_screen(
2043 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2044 % (', '.join(feed_ids), video_id))
2045 return self.playlist_result(entries, video_id, video_title, video_description)
2046 else:
2047 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2048
2049 if view_count is None:
2050 view_count = extract_view_count(video_info)
2051 if view_count is None and video_details:
2052 view_count = int_or_none(video_details.get('viewCount'))
2053 if view_count is None and microformat:
2054 view_count = int_or_none(microformat.get('viewCount'))
2055
2056 if is_live is None:
2057 is_live = bool_or_none(video_details.get('isLive'))
2058
2059 has_live_chat_replay = False
2060 if not is_live:
2061 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2062 try:
2063 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2064 has_live_chat_replay = True
2065 except (KeyError, IndexError, TypeError):
2066 pass
2067
2068 # Check for "rental" videos
2069 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2070 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2071
2072 def _extract_filesize(media_url):
2073 return int_or_none(self._search_regex(
2074 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2075
2076 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2077 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2078
2079 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2080 self.report_rtmp_download()
2081 formats = [{
2082 'format_id': '_rtmp',
2083 'protocol': 'rtmp',
2084 'url': video_info['conn'][0],
2085 'player_url': player_url,
2086 }]
2087 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2088 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2089 if 'rtmpe%3Dyes' in encoded_url_map:
2090 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2091 formats = []
2092 formats_spec = {}
2093 fmt_list = video_info.get('fmt_list', [''])[0]
2094 if fmt_list:
2095 for fmt in fmt_list.split(','):
2096 spec = fmt.split('/')
2097 if len(spec) > 1:
2098 width_height = spec[1].split('x')
2099 if len(width_height) == 2:
2100 formats_spec[spec[0]] = {
2101 'resolution': spec[1],
2102 'width': int_or_none(width_height[0]),
2103 'height': int_or_none(width_height[1]),
2104 }
2105 for fmt in streaming_formats:
2106 itag = str_or_none(fmt.get('itag'))
2107 if not itag:
2108 continue
2109 quality = fmt.get('quality')
2110 quality_label = fmt.get('qualityLabel') or quality
2111 formats_spec[itag] = {
2112 'asr': int_or_none(fmt.get('audioSampleRate')),
2113 'filesize': int_or_none(fmt.get('contentLength')),
2114 'format_note': quality_label,
2115 'fps': int_or_none(fmt.get('fps')),
2116 'height': int_or_none(fmt.get('height')),
2117 # bitrate for itag 43 is always 2147483647
2118 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2119 'width': int_or_none(fmt.get('width')),
2120 }
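# Illustrative formats_spec entry keyed by itag (hypothetical values):
#   formats_spec['22'] == {'asr': 44100, 'filesize': 12345678, 'format_note': '720p',
#                          'fps': 30, 'height': 720, 'tbr': 1152.0, 'width': 1280}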
2121
2122 for fmt in streaming_formats:
2123 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2124 continue
2125 url = url_or_none(fmt.get('url'))
2126
2127 if not url:
2128 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2129 if not cipher:
2130 continue
2131 url_data = compat_parse_qs(cipher)
2132 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2133 if not url:
2134 continue
2135 else:
2136 cipher = None
2137 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
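# Illustrative (hypothetical) values handled above: a 'cipher'/'signatureCipher' string looks like
#   's=776AOq0QJ8w...&sp=sig&url=https%3A%2F%2Fr4---sn-example.googlevideo.com%2Fvideoplayback%3F...'
# and compat_parse_qs() yields {'s': ['776AOq0QJ8w...'], 'sp': ['sig'], 'url': ['https://...']};
# the non-cipher branch instead parses the query of the plain media URL.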
2138
2139 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2140 # Unsupported FORMAT_STREAM_TYPE_OTF
2141 if stream_type == 3:
2142 continue
2143
2144 format_id = fmt.get('itag') or url_data['itag'][0]
2145 if not format_id:
2146 continue
2147 format_id = compat_str(format_id)
2148
2149 if cipher:
2150 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2151 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
2152 jsplayer_url_json = self._search_regex(
2153 ASSETS_RE,
2154 embed_webpage if age_gate else video_webpage,
2155 'JS player URL (1)', default=None)
2156 if not jsplayer_url_json and not age_gate:
2157 # We need the embed website after all
2158 if embed_webpage is None:
2159 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2160 embed_webpage = self._download_webpage(
2161 embed_url, video_id, 'Downloading embed webpage')
2162 jsplayer_url_json = self._search_regex(
2163 ASSETS_RE, embed_webpage, 'JS player URL')
2164
2165 player_url = json.loads(jsplayer_url_json)
2166 if player_url is None:
2167 player_url_json = self._search_regex(
2168 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2169 video_webpage, 'age gate player URL')
2170 player_url = json.loads(player_url_json)
2171
2172 if 'sig' in url_data:
2173 url += '&signature=' + url_data['sig'][0]
2174 elif 's' in url_data:
2175 encrypted_sig = url_data['s'][0]
2176
2177 if self._downloader.params.get('verbose'):
2178 if player_url is None:
2179 player_desc = 'unknown'
2180 else:
2181 player_type, player_version = self._extract_player_info(player_url)
2182 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2183 parts_sizes = self._signature_cache_id(encrypted_sig)
2184 self.to_screen('{%s} signature length %s, %s' %
2185 (format_id, parts_sizes, player_desc))
2186
2187 signature = self._decrypt_signature(
2188 encrypted_sig, video_id, player_url, age_gate)
2189 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2190 url += '&%s=%s' % (sp, signature)
2191 if 'ratebypass' not in url:
2192 url += '&ratebypass=yes'
2193
2194 dct = {
2195 'format_id': format_id,
2196 'url': url,
2197 'player_url': player_url,
2198 }
2199 if format_id in self._formats:
2200 dct.update(self._formats[format_id])
2201 if format_id in formats_spec:
2202 dct.update(formats_spec[format_id])
2203
2204 # Some itags are not included in the DASH manifest, so the corresponding formats
2205 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2206 # Try to extract the metadata from the url_encoded_fmt_stream_map entry instead.
2207 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2208 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2209
2210 if width is None:
2211 width = int_or_none(fmt.get('width'))
2212 if height is None:
2213 height = int_or_none(fmt.get('height'))
2214
2215 filesize = int_or_none(url_data.get(
2216 'clen', [None])[0]) or _extract_filesize(url)
2217
2218 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2219 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2220
2221 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2222 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2223 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2224
2225 more_fields = {
2226 'filesize': filesize,
2227 'tbr': tbr,
2228 'width': width,
2229 'height': height,
2230 'fps': fps,
2231 'format_note': quality_label or quality,
2232 }
2233 for key, value in more_fields.items():
2234 if value:
2235 dct[key] = value
2236 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2237 if type_:
2238 type_split = type_.split(';')
2239 kind_ext = type_split[0].split('/')
2240 if len(kind_ext) == 2:
2241 kind, _ = kind_ext
2242 dct['ext'] = mimetype2ext(type_split[0])
2243 if kind in ('audio', 'video'):
2244 codecs = None
2245 for mobj in re.finditer(
2246 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2247 if mobj.group('key') == 'codecs':
2248 codecs = mobj.group('val')
2249 break
2250 if codecs:
2251 dct.update(parse_codecs(codecs))
2252 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2253 dct['downloader_options'] = {
2254 # Youtube throttles chunks >~10M
2255 'http_chunk_size': 10485760,
2256 }
2257 formats.append(dct)
2258 else:
2259 manifest_url = (
2260 url_or_none(try_get(
2261 player_response,
2262 lambda x: x['streamingData']['hlsManifestUrl'],
2263 compat_str))
2264 or url_or_none(try_get(
2265 video_info, lambda x: x['hlsvp'][0], compat_str)))
2266 if manifest_url:
2267 formats = []
2268 m3u8_formats = self._extract_m3u8_formats(
2269 manifest_url, video_id, 'mp4', fatal=False)
2270 for a_format in m3u8_formats:
2271 itag = self._search_regex(
2272 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2273 if itag:
2274 a_format['format_id'] = itag
2275 if itag in self._formats:
2276 dct = self._formats[itag].copy()
2277 dct.update(a_format)
2278 a_format = dct
2279 a_format['player_url'] = player_url
2280 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2281 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2282 if self._downloader.params.get('youtube_include_hls_manifest', True):
2283 formats.append(a_format)
2284 else:
2285 error_message = extract_unavailable_message()
2286 if not error_message:
2287 error_message = clean_html(try_get(
2288 player_response, lambda x: x['playabilityStatus']['reason'],
2289 compat_str))
2290 if not error_message:
2291 error_message = clean_html(
2292 try_get(video_info, lambda x: x['reason'][0], compat_str))
2293 if error_message:
2294 raise ExtractorError(error_message, expected=True)
2295 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2296
2297 # uploader
2298 video_uploader = try_get(
2299 video_info, lambda x: x['author'][0],
2300 compat_str) or str_or_none(video_details.get('author'))
2301 if video_uploader:
2302 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2303 else:
2304 self._downloader.report_warning('unable to extract uploader name')
2305
2306 # uploader_id
2307 video_uploader_id = None
2308 video_uploader_url = None
2309 mobj = re.search(
2310 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2311 video_webpage)
2312 if mobj is not None:
2313 video_uploader_id = mobj.group('uploader_id')
2314 video_uploader_url = mobj.group('uploader_url')
2315 else:
2316 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2317 if owner_profile_url:
2318 video_uploader_id = self._search_regex(
2319 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2320 default=None)
2321 video_uploader_url = owner_profile_url
2322
2323 channel_id = (
2324 str_or_none(video_details.get('channelId'))
2325 or self._html_search_meta(
2326 'channelId', video_webpage, 'channel id', default=None)
2327 or self._search_regex(
2328 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2329 video_webpage, 'channel id', default=None, group='id'))
2330 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2331
2332 thumbnails = []
2333 thumbnails_list = try_get(
2334 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2335 for t in thumbnails_list:
2336 if not isinstance(t, dict):
2337 continue
2338 thumbnail_url = url_or_none(t.get('url'))
2339 if not thumbnail_url:
2340 continue
2341 thumbnails.append({
2342 'url': thumbnail_url,
2343 'width': int_or_none(t.get('width')),
2344 'height': int_or_none(t.get('height')),
2345 })
2346
2347 if not thumbnails:
2348 video_thumbnail = None
2349 # We first try to get a high-quality image:
2350 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2351 video_webpage, re.DOTALL)
2352 if m_thumb is not None:
2353 video_thumbnail = m_thumb.group(1)
2354 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2355 if thumbnail_url:
2356 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2357 if video_thumbnail:
2358 thumbnails.append({'url': video_thumbnail})
2359
2360 # upload date
2361 upload_date = self._html_search_meta(
2362 'datePublished', video_webpage, 'upload date', default=None)
2363 if not upload_date:
2364 upload_date = self._search_regex(
2365 [r'(?s)id="eow-date.*?>(.*?)</span>',
2366 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2367 video_webpage, 'upload date', default=None)
2368 if not upload_date:
2369 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2370 upload_date = unified_strdate(upload_date)
2371
2372 video_license = self._html_search_regex(
2373 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2374 video_webpage, 'license', default=None)
2375
2376 m_music = re.search(
2377 r'''(?x)
2378 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2379 <ul[^>]*>\s*
2380 <li>(?P<title>.+?)
2381 by (?P<creator>.+?)
2382 (?:
2383 \(.+?\)|
2384 <a[^>]*
2385 (?:
2386 \bhref=["\']/red[^>]*>| # drop possible
2387 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2388 )
2389 .*?
2390 )?</li
2391 ''',
2392 video_webpage)
2393 if m_music:
2394 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2395 video_creator = clean_html(m_music.group('creator'))
2396 else:
2397 video_alt_title = video_creator = None
2398
2399 def extract_meta(field):
2400 return self._html_search_regex(
2401 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2402 video_webpage, field, default=None)
2403
2404 track = extract_meta('Song')
2405 artist = extract_meta('Artist')
2406 album = extract_meta('Album')
2407
2408 # Youtube Music Auto-generated description
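# Illustrative (hypothetical) auto-generated description the regex below targets:
#   Provided to YouTube by SomeLabel
#   Song Title · Artist Name
#   Album Name
#   ℗ 2018 SomeLabel
#   Released on: 2018-05-04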
2409 release_date = release_year = None
2410 if video_description:
2411 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2412 if mobj:
2413 if not track:
2414 track = mobj.group('track').strip()
2415 if not artist:
2416 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2417 if not album:
2418 album = mobj.group('album').strip()
2419 release_year = mobj.group('release_year')
2420 release_date = mobj.group('release_date')
2421 if release_date:
2422 release_date = release_date.replace('-', '')
2423 if not release_year:
2424 release_year = int(release_date[:4])
2425 if release_year:
2426 release_year = int(release_year)
2427
2428 m_episode = re.search(
2429 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2430 video_webpage)
2431 if m_episode:
2432 series = unescapeHTML(m_episode.group('series'))
2433 season_number = int(m_episode.group('season'))
2434 episode_number = int(m_episode.group('episode'))
2435 else:
2436 series = season_number = episode_number = None
2437
2438 m_cat_container = self._search_regex(
2439 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2440 video_webpage, 'categories', default=None)
2441 category = None
2442 if m_cat_container:
2443 category = self._html_search_regex(
2444 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2445 default=None)
2446 if not category:
2447 category = try_get(
2448 microformat, lambda x: x['category'], compat_str)
2449 video_categories = None if category is None else [category]
2450
2451 video_tags = [
2452 unescapeHTML(m.group('content'))
2453 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2454 if not video_tags:
2455 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2456
2457 def _extract_count(count_name):
2458 return str_to_int(self._search_regex(
2459 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2460 % re.escape(count_name),
2461 video_webpage, count_name, default=None))
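# Illustrative (hypothetical) markup _extract_count targets, e.g. for count_name 'like':
#   "accessibilityData":{"label":"12,345 likes"}  ->  str_to_int('12,345') == 12345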
2462
2463 like_count = _extract_count('like')
2464 dislike_count = _extract_count('dislike')
2465
2466 if view_count is None:
2467 view_count = str_to_int(self._search_regex(
2468 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2469 'view count', default=None))
2470
2471 average_rating = (
2472 float_or_none(video_details.get('averageRating'))
2473 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2474
2475 # subtitles
2476 video_subtitles = self.extract_subtitles(
2477 video_id, video_webpage, has_live_chat_replay)
2478 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2479
2480 video_duration = try_get(
2481 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2482 if not video_duration:
2483 video_duration = int_or_none(video_details.get('lengthSeconds'))
2484 if not video_duration:
2485 video_duration = parse_duration(self._html_search_meta(
2486 'duration', video_webpage, 'video duration'))
2487
2488 # Get the subscriber count of the channel
2489 subscriber_count = parse_count(self._search_regex(
2490 r'"text":"([\d\.]+\w?) subscribers"',
2491 video_webpage,
2492 'subscriber count',
2493 default=None
2494 ))
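# e.g. (illustrative) a snippet like '"text":"1.23M subscribers"' yields
# parse_count('1.23M') == 1230000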
2495
2496 # annotations
2497 video_annotations = None
2498 if self._downloader.params.get('writeannotations', False):
2499 xsrf_token = self._search_regex(
2500 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2501 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2502 invideo_url = try_get(
2503 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2504 if xsrf_token and invideo_url:
2505 xsrf_field_name = self._search_regex(
2506 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2507 video_webpage, 'xsrf field name',
2508 group='xsrf_field_name', default='session_token')
2509 video_annotations = self._download_webpage(
2510 self._proto_relative_url(invideo_url),
2511 video_id, note='Downloading annotations',
2512 errnote='Unable to download video annotations', fatal=False,
2513 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2514
2515 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2516
2517 # Look for the DASH manifest
2518 if self._downloader.params.get('youtube_include_dash_manifest', True):
2519 dash_mpd_fatal = True
2520 for mpd_url in dash_mpds:
2521 dash_formats = {}
2522 try:
2523 def decrypt_sig(mobj):
2524 s = mobj.group(1)
2525 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2526 return '/signature/%s' % dec_s
2527
2528 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2529
2530 for df in self._extract_mpd_formats(
2531 mpd_url, video_id, fatal=dash_mpd_fatal,
2532 formats_dict=self._formats):
2533 if not df.get('filesize'):
2534 df['filesize'] = _extract_filesize(df['url'])
2535 # Do not overwrite DASH format found in some previous DASH manifest
2536 if df['format_id'] not in dash_formats:
2537 dash_formats[df['format_id']] = df
2538 # Additional DASH manifests may result in HTTP Error 403, therefore
2539 # allow them to fail without a bug report message if at least one
2540 # DASH manifest has already succeeded. This is a temporary workaround to
2541 # reduce the burst of bug reports until we figure out the reason and
2542 # whether it can be fixed at all.
2543 dash_mpd_fatal = False
2544 except (ExtractorError, KeyError) as e:
2545 self.report_warning(
2546 'Skipping DASH manifest: %r' % e, video_id)
2547 if dash_formats:
2548 # Remove the formats found through non-DASH extraction; they
2549 # contain less info and can be wrong because we use
2550 # fixed values (for example the resolution). See
2551 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2552 # example.
2553 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2554 formats.extend(dash_formats.values())
2555
2556 # Check for malformed aspect ratio
2557 stretched_m = re.search(
2558 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2559 video_webpage)
2560 if stretched_m:
2561 w = float(stretched_m.group('w'))
2562 h = float(stretched_m.group('h'))
2563 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2564 # We will only process correct ratios.
2565 if w > 0 and h > 0:
2566 ratio = w / h
2567 for f in formats:
2568 if f.get('vcodec') != 'none':
2569 f['stretched_ratio'] = ratio
2570
2571 if not formats:
2572 if 'reason' in video_info:
2573 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2574 regions_allowed = self._html_search_meta(
2575 'regionsAllowed', video_webpage, default=None)
2576 countries = regions_allowed.split(',') if regions_allowed else None
2577 self.raise_geo_restricted(
2578 msg=video_info['reason'][0], countries=countries)
2579 reason = video_info['reason'][0]
2580 if 'Invalid parameters' in reason:
2581 unavailable_message = extract_unavailable_message()
2582 if unavailable_message:
2583 reason = unavailable_message
2584 raise ExtractorError(
2585 'YouTube said: %s' % reason,
2586 expected=True, video_id=video_id)
2587 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2588 raise ExtractorError('This video is DRM protected.', expected=True)
2589
2590 self._sort_formats(formats)
2591
2592 self.mark_watched(video_id, video_info, player_response)
2593
2594 return {
2595 'id': video_id,
2596 'uploader': video_uploader,
2597 'uploader_id': video_uploader_id,
2598 'uploader_url': video_uploader_url,
2599 'channel_id': channel_id,
2600 'channel_url': channel_url,
2601 'upload_date': upload_date,
2602 'license': video_license,
2603 'creator': video_creator or artist,
2604 'title': video_title,
2605 'alt_title': video_alt_title or track,
2606 'thumbnails': thumbnails,
2607 'description': video_description,
2608 'categories': video_categories,
2609 'tags': video_tags,
2610 'subtitles': video_subtitles,
2611 'automatic_captions': automatic_captions,
2612 'duration': video_duration,
2613 'age_limit': 18 if age_gate else 0,
2614 'annotations': video_annotations,
2615 'chapters': chapters,
2616 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2617 'view_count': view_count,
2618 'like_count': like_count,
2619 'dislike_count': dislike_count,
2620 'average_rating': average_rating,
2621 'formats': formats,
2622 'is_live': is_live,
2623 'start_time': start_time,
2624 'end_time': end_time,
2625 'series': series,
2626 'season_number': season_number,
2627 'episode_number': episode_number,
2628 'track': track,
2629 'artist': artist,
2630 'album': album,
2631 'release_date': release_date,
2632 'release_year': release_year,
2633 'subscriber_count': subscriber_count,
2634 }
2635
2636
2637 class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
2638 IE_DESC = 'YouTube.com playlists'
2639 _VALID_URL = r"""(?x)(?:
2640 (?:https?://)?
2641 (?:\w+\.)?
2642 (?:
2643 (?:
2644 youtube(?:kids)?\.com|
2645 invidio\.us
2646 )
2647 /
2648 (?:
2649 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
2650 \? (?:.*?[&;])*? (?:p|a|list)=
2651 | p/
2652 )|
2653 youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
2654 )
2655 (
2656 (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
2657 # Top tracks, they can also include dots
2658 |(?:MC)[\w\.]*
2659 )
2660 .*
2661 |
2662 (%(playlist_id)s)
2663 )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
2664 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
2665 _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
2666 _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
2667 IE_NAME = 'youtube:playlist'
2668 _TESTS = [{
2669 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2670 'info_dict': {
2671 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2672 'uploader': 'Sergey M.',
2673 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2674 'title': 'youtube-dl public playlist',
2675 },
2676 'playlist_count': 1,
2677 }, {
2678 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2679 'info_dict': {
2680 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2681 'uploader': 'Sergey M.',
2682 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2683 'title': 'youtube-dl empty playlist',
2684 },
2685 'playlist_count': 0,
2686 }, {
2687 'note': 'Playlist with deleted videos (#651). As a bonus, video #51 appears twice in this list.',
2688 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2689 'info_dict': {
2690 'title': '29C3: Not my department',
2691 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2692 'uploader': 'Christiaan008',
2693 'uploader_id': 'ChRiStIaAn008',
2694 },
2695 'playlist_count': 96,
2696 }, {
2697 'note': 'issue #673',
2698 'url': 'PLBB231211A4F62143',
2699 'info_dict': {
2700 'title': '[OLD]Team Fortress 2 (Class-based LP)',
2701 'id': 'PLBB231211A4F62143',
2702 'uploader': 'Wickydoo',
2703 'uploader_id': 'Wickydoo',
2704 },
2705 'playlist_mincount': 26,
2706 }, {
2707 'note': 'Large playlist',
2708 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2709 'info_dict': {
2710 'title': 'Uploads from Cauchemar',
2711 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2712 'uploader': 'Cauchemar',
2713 'uploader_id': 'Cauchemar89',
2714 },
2715 'playlist_mincount': 799,
2716 }, {
2717 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2718 'info_dict': {
2719 'title': 'YDL_safe_search',
2720 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
2721 },
2722 'playlist_count': 2,
2723 'skip': 'This playlist is private',
2724 }, {
2725 'note': 'embedded',
2726 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
2727 'playlist_count': 4,
2728 'info_dict': {
2729 'title': 'JODA15',
2730 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
2731 'uploader': 'milan',
2732 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
2733 }
2734 }, {
2735 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2736 'playlist_mincount': 485,
2737 'info_dict': {
2738 'title': '2018 Chinese New Singles (11/6 updated)',
2739 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
2740 'uploader': 'LBK',
2741 'uploader_id': 'sdragonfang',
2742 }
2743 }, {
2744 'note': 'Embedded SWF player',
2745 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
2746 'playlist_count': 4,
2747 'info_dict': {
2748 'title': 'JODA7',
2749 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
2750 },
2751 'skip': 'This playlist does not exist',
2752 }, {
2753 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2754 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2755 'info_dict': {
2756 'title': 'Uploads from Interstellar Movie',
2757 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2758 'uploader': 'Interstellar Movie',
2759 'uploader_id': 'InterstellarMovie1',
2760 },
2761 'playlist_mincount': 21,
2762 }, {
2763 # Playlist URL that does not actually serve a playlist
2764 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2765 'info_dict': {
2766 'id': 'FqZTN594JQw',
2767 'ext': 'webm',
2768 'title': "Smiley's People 01 detective, Adventure Series, Action",
2769 'uploader': 'STREEM',
2770 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2771 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2772 'upload_date': '20150526',
2773 'license': 'Standard YouTube License',
2774 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2775 'categories': ['People & Blogs'],
2776 'tags': list,
2777 'view_count': int,
2778 'like_count': int,
2779 'dislike_count': int,
2780 },
2781 'params': {
2782 'skip_download': True,
2783 },
2784 'skip': 'This video is not available.',
2785 'add_ie': [YoutubeIE.ie_key()],
2786 }, {
2787 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
2788 'info_dict': {
2789 'id': 'yeWKywCrFtk',
2790 'ext': 'mp4',
2791 'title': 'Small Scale Baler and Braiding Rugs',
2792 'uploader': 'Backus-Page House Museum',
2793 'uploader_id': 'backuspagemuseum',
2794 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
2795 'upload_date': '20161008',
2796 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
2797 'categories': ['Nonprofits & Activism'],
2798 'tags': list,
2799 'like_count': int,
2800 'dislike_count': int,
2801 },
2802 'params': {
2803 'noplaylist': True,
2804 'skip_download': True,
2805 },
2806 }, {
2807 # https://github.com/ytdl-org/youtube-dl/issues/21844
2808 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2809 'info_dict': {
2810 'title': 'Data Analysis with Dr Mike Pound',
2811 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2812 'uploader_id': 'Computerphile',
2813 'uploader': 'Computerphile',
2814 },
2815 'playlist_mincount': 11,
2816 }, {
2817 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
2818 'only_matching': True,
2819 }, {
2820 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
2821 'only_matching': True,
2822 }, {
2823 # music album playlist
2824 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
2825 'only_matching': True,
2826 }, {
2827 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
2828 'only_matching': True,
2829 }, {
2830 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2831 'only_matching': True,
2832 }]
2833
2834 def _real_initialize(self):
2835 self._login()
2836
2837 def extract_videos_from_page(self, page):
2838 ids_in_page = []
2839 titles_in_page = []
2840
2841 for item in re.findall(
2842 r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
2843 attrs = extract_attributes(item)
2844 video_id = attrs['data-video-id']
2845 video_title = unescapeHTML(attrs.get('data-title'))
2846 if video_title:
2847 video_title = video_title.strip()
2848 ids_in_page.append(video_id)
2849 titles_in_page.append(video_title)
2850
2851 # Fallback with old _VIDEO_RE
2852 self.extract_videos_from_page_impl(
2853 self._VIDEO_RE, page, ids_in_page, titles_in_page)
2854
2855 # Relaxed fallbacks
2856 self.extract_videos_from_page_impl(
2857 r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
2858 ids_in_page, titles_in_page)
2859 self.extract_videos_from_page_impl(
2860 r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
2861 ids_in_page, titles_in_page)
2862
2863 return zip(ids_in_page, titles_in_page)
2864
2865 def _extract_mix(self, playlist_id):
2866 # Mixes are generated from a single video;
2867 # the id of the playlist is just 'RD' + video_id
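# e.g. (illustrative) a mix seeded by video 'XXXXXXXXXXX' uses playlist id 'RDXXXXXXXXXXX',
# so last_id below starts out as that seed video id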
2868 ids = []
2869 last_id = playlist_id[-11:]
2870 for n in itertools.count(1):
2871 url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
2872 webpage = self._download_webpage(
2873 url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
2874 new_ids = orderedSet(re.findall(
2875 r'''(?xs)data-video-username=".*?".*?
2876 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
2877 webpage))
2878 # Fetch new pages until all the videos are repeated; it seems that
2879 # there are always 51 unique videos.
2880 new_ids = [_id for _id in new_ids if _id not in ids]
2881 if not new_ids:
2882 break
2883 ids.extend(new_ids)
2884 last_id = ids[-1]
2885
2886 url_results = self._ids_to_results(ids)
2887
2888 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
2889 title_span = (
2890 search_title('playlist-title')
2891 or search_title('title long-title')
2892 or search_title('title'))
2893 title = clean_html(title_span)
2894
2895 return self.playlist_result(url_results, playlist_id, title)
2896
2897 def _extract_playlist(self, playlist_id):
2898 url = self._TEMPLATE_URL % playlist_id
2899 page = self._download_webpage(url, playlist_id)
2900
2901 # the yt-alert-message now has a tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
2902 for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
2903 match = match.strip()
2904 # Check if the playlist exists or is private
2905 mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
2906 if mobj:
2907 reason = mobj.group('reason')
2908 message = 'This playlist %s' % reason
2909 if 'private' in reason:
2910 message += ', use --username or --netrc to access it'
2911 message += '.'
2912 raise ExtractorError(message, expected=True)
2913 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
2914 raise ExtractorError(
2915 'Invalid parameters. Maybe URL is incorrect.',
2916 expected=True)
2917 elif re.match(r'[^<]*Choose your language[^<]*', match):
2918 continue
2919 else:
2920 self.report_warning('Youtube gives an alert message: ' + match)
2921
2922 playlist_title = self._html_search_regex(
2923 r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
2924 page, 'title', default=None)
2925
2926 _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
2927 uploader = self._html_search_regex(
2928 r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
2929 page, 'uploader', default=None)
2930 mobj = re.search(
2931 r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
2932 page)
2933 if mobj:
2934 uploader_id = mobj.group('uploader_id')
2935 uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
2936 else:
2937 uploader_id = uploader_url = None
2938
2939 has_videos = True
2940
2941 if not playlist_title:
2942 try:
2943 # Some playlist URLs don't actually serve a playlist (e.g.
2944 # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
2945 next(self._entries(page, playlist_id))
2946 except StopIteration:
2947 has_videos = False
2948
2949 playlist = self.playlist_result(
2950 self._entries(page, playlist_id), playlist_id, playlist_title)
2951 playlist.update({
2952 'uploader': uploader,
2953 'uploader_id': uploader_id,
2954 'uploader_url': uploader_url,
2955 })
2956
2957 return has_videos, playlist
2958
2959 def _check_download_just_video(self, url, playlist_id):
2960 # Check if it's a video-specific URL
2961 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
2962 video_id = query_dict.get('v', [None])[0] or self._search_regex(
2963 r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
2964 'video id', default=None)
2965 if video_id:
2966 if self._downloader.params.get('noplaylist'):
2967 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2968 return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
2969 else:
2970 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
2971 return video_id, None
2972 return None, None
2973
2974 def _real_extract(self, url):
2975 # Extract playlist id
2976 mobj = re.match(self._VALID_URL, url)
2977 if mobj is None:
2978 raise ExtractorError('Invalid URL: %s' % url)
2979 playlist_id = mobj.group(1) or mobj.group(2)
2980
2981 video_id, video = self._check_download_just_video(url, playlist_id)
2982 if video:
2983 return video
2984
2985 if playlist_id.startswith(('RD', 'UL', 'PU')):
2986 # Mixes require a custom extraction process
2987 return self._extract_mix(playlist_id)
2988
2989 has_videos, playlist = self._extract_playlist(playlist_id)
2990 if has_videos or not video_id:
2991 return playlist
2992
2993 # Some playlist URLs don't actually serve a playlist (see
2994 # https://github.com/ytdl-org/youtube-dl/issues/10537).
2995 # Fallback to plain video extraction if there is a video id
2996 # along with playlist id.
2997 return self.url_result(video_id, 'Youtube', video_id=video_id)
2998
2999
3000 class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
3001 IE_DESC = 'YouTube.com channels'
3002 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
3003 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
3004 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
3005 IE_NAME = 'youtube:channel'
3006 _TESTS = [{
3007 'note': 'paginated channel',
3008 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
3009 'playlist_mincount': 91,
3010 'info_dict': {
3011 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
3012 'title': 'Uploads from lex will',
3013 'uploader': 'lex will',
3014 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
3015 }
3016 }, {
3017 'note': 'Age restricted channel',
3018 # from https://www.youtube.com/user/DeusExOfficial
3019 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
3020 'playlist_mincount': 64,
3021 'info_dict': {
3022 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
3023 'title': 'Uploads from Deus Ex',
3024 'uploader': 'Deus Ex',
3025 'uploader_id': 'DeusExOfficial',
3026 },
3027 }, {
3028 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
3029 'only_matching': True,
3030 }, {
3031 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
3032 'only_matching': True,
3033 }]
3034
3035 @classmethod
3036 def suitable(cls, url):
3037 return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
3038 else super(YoutubeChannelIE, cls).suitable(url))
3039
3040 def _build_template_url(self, url, channel_id):
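# Overridden by YoutubeUserIE to choose between /user/ and /c/ style URLs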
3041 return self._TEMPLATE_URL % channel_id
3042
3043 def _real_extract(self, url):
3044 channel_id = self._match_id(url)
3045
3046 url = self._build_template_url(url, channel_id)
3047
3048 # Page-by-page channel listing is restricted to 35 pages of 30 items, i.e. 1050 videos in total (see #5778)
3049 # Workaround: extract as a playlist if we manage to obtain the channel playlist URL,
3050 # otherwise fall back on page-by-page channel extraction
3051 channel_page = self._download_webpage(
3052 url + '?view=57', channel_id,
3053 'Downloading channel page', fatal=False)
3054 if channel_page is False:
3055 channel_playlist_id = False
3056 else:
3057 channel_playlist_id = self._html_search_meta(
3058 'channelId', channel_page, 'channel id', default=None)
3059 if not channel_playlist_id:
3060 channel_url = self._html_search_meta(
3061 ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
3062 channel_page, 'channel url', default=None)
3063 if channel_url:
3064 channel_playlist_id = self._search_regex(
3065 r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
3066 channel_url, 'channel id', default=None)
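# A channel's uploads playlist id is the channel id with the leading 'UC' replaced by 'UU'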
3067 if channel_playlist_id and channel_playlist_id.startswith('UC'):
3068 playlist_id = 'UU' + channel_playlist_id[2:]
3069 return self.url_result(
3070 compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
3071
3072 channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
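# Auto-generated channels are detected from CSS class names in the page markup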
3073 autogenerated = re.search(r'''(?x)
3074 class="[^"]*?(?:
3075 channel-header-autogenerated-label|
3076 yt-channel-title-autogenerated
3077 )[^"]*"''', channel_page) is not None
3078
3079 if autogenerated:
3080 # All the videos are contained in a single page;
3081 # the AJAX pages can't be used since they are empty
3082 entries = [
3083 self.url_result(
3084 video_id, 'Youtube', video_id=video_id,
3085 video_title=video_title)
3086 for video_id, video_title in self.extract_videos_from_page(channel_page)]
3087 return self.playlist_result(entries, channel_id)
3088
3089 try:
3090 next(self._entries(channel_page, channel_id))
3091 except StopIteration:
3092 alert_message = self._html_search_regex(
3093 r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
3094 channel_page, 'alert', default=None, group='alert')
3095 if alert_message:
3096 raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
3097
3098 return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3099
3100
3101 class YoutubeUserIE(YoutubeChannelIE):
3102 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
3103 _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
3104 _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
3105 IE_NAME = 'youtube:user'
3106
3107 _TESTS = [{
3108 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
3109 'playlist_mincount': 320,
3110 'info_dict': {
3111 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
3112 'title': 'Uploads from The Linux Foundation',
3113 'uploader': 'The Linux Foundation',
3114 'uploader_id': 'TheLinuxFoundation',
3115 }
3116 }, {
3117 # Only available via https://www.youtube.com/c/12minuteathlete/videos
3118 # but not https://www.youtube.com/user/12minuteathlete/videos
3119 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
3120 'playlist_mincount': 249,
3121 'info_dict': {
3122 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
3123 'title': 'Uploads from 12 Minute Athlete',
3124 'uploader': '12 Minute Athlete',
3125 'uploader_id': 'the12minuteathlete',
3126 }
3127 }, {
3128 'url': 'ytuser:phihag',
3129 'only_matching': True,
3130 }, {
3131 'url': 'https://www.youtube.com/c/gametrailers',
3132 'only_matching': True,
3133 }, {
3134 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
3135 'only_matching': True,
3136 }, {
3137 'url': 'https://www.youtube.com/gametrailers',
3138 'only_matching': True,
3139 }, {
3140 # This channel is not available, geo-restricted to JP
3141 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
3142 'only_matching': True,
3143 }]
3144
3145 @classmethod
3146 def suitable(cls, url):
3147 # Don't return True if the url can be extracted with another youtube
3148 # extractor; the regex is too permissive and it would match.
3149 other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
3150 if any(ie.suitable(url) for ie in other_yt_ies):
3151 return False
3152 else:
3153 return super(YoutubeUserIE, cls).suitable(url)
3154
3155 def _build_template_url(self, url, channel_id):
3156 mobj = re.match(self._VALID_URL, url)
3157 return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
3158
3159
3160 class YoutubeLiveIE(YoutubeBaseInfoExtractor):
3161 IE_DESC = 'YouTube.com live streams'
3162 _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
3163 IE_NAME = 'youtube:live'
3164
3165 _TESTS = [{
3166 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
3167 'info_dict': {
3168 'id': 'a48o2S1cPoo',
3169 'ext': 'mp4',
3170 'title': 'The Young Turks - Live Main Show',
3171 'uploader': 'The Young Turks',
3172 'uploader_id': 'TheYoungTurks',
3173 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
3174 'upload_date': '20150715',
3175 'license': 'Standard YouTube License',
3176 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
3177 'categories': ['News & Politics'],
3178 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
3179 'like_count': int,
3180 'dislike_count': int,
3181 },
3182 'params': {
3183 'skip_download': True,
3184 },
3185 }, {
3186 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
3187 'only_matching': True,
3188 }, {
3189 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
3190 'only_matching': True,
3191 }, {
3192 'url': 'https://www.youtube.com/TheYoungTurks/live',
3193 'only_matching': True,
3194 }]
3195
3196 def _real_extract(self, url):
3197 mobj = re.match(self._VALID_URL, url)
3198 channel_id = mobj.group('id')
3199 base_url = mobj.group('base_url')
3200 webpage = self._download_webpage(url, channel_id, fatal=False)
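# If the /live page resolves to a concrete 11-character video id, hand it to the
# regular video extractor; otherwise fall back to the channel/user page itself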
3201 if webpage:
3202 page_type = self._og_search_property(
3203 'type', webpage, 'page type', default='')
3204 video_id = self._html_search_meta(
3205 'videoId', webpage, 'video id', default=None)
3206 if page_type.startswith('video') and video_id and re.match(
3207 r'^[0-9A-Za-z_-]{11}$', video_id):
3208 return self.url_result(video_id, YoutubeIE.ie_key())
3209 return self.url_result(base_url)
3210
3211
3212 class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
3213 IE_DESC = 'YouTube.com user/channel playlists'
3214 _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
3215 IE_NAME = 'youtube:playlists'
3216
3217 _TESTS = [{
3218 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
3219 'playlist_mincount': 4,
3220 'info_dict': {
3221 'id': 'ThirstForScience',
3222 'title': 'ThirstForScience',
3223 },
3224 }, {
3225 # with "Load more" button
3226 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
3227 'playlist_mincount': 70,
3228 'info_dict': {
3229 'id': 'igorkle1',
3230 'title': 'Игорь Клейнер',
3231 },
3232 }, {
3233 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
3234 'playlist_mincount': 17,
3235 'info_dict': {
3236 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
3237 'title': 'Chem Player',
3238 },
3239 'skip': 'Blocked',
3240 }, {
3241 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
3242 'only_matching': True,
3243 }]
3244
3245
3246 class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
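# Matches watch links in the rendered search results, capturing the 11-character
# video id and, when present, the title attribute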
3247 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3248
3249
3250 class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
3251 IE_DESC = 'YouTube.com searches'
3252 # there doesn't appear to be a real limit; for example, searching for
3253 # 'python' yields more than 8,000,000 results
3254 _MAX_RESULTS = float('inf')
3255 IE_NAME = 'youtube:search'
3256 _SEARCH_KEY = 'ytsearch'
3257 _EXTRA_QUERY_ARGS = {}
3258 _TESTS = []
3259
3260 def _get_n_results(self, query, n):
3261 """Get a specified number of results for a query"""
3262
3263 videos = []
3264 limit = n
3265
3266 url_query = {
3267 'search_query': query.encode('utf-8'),
3268 }
3269 url_query.update(self._EXTRA_QUERY_ARGS)
3270 result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
3271
3272 for pagenum in itertools.count(1):
3273 data = self._download_json(
3274 result_url, video_id='query "%s"' % query,
3275 note='Downloading page %s' % pagenum,
3276 errnote='Unable to download API page',
3277 query={'spf': 'navigate'})
3278 html_content = data[1]['body']['content']
3279
3280 if 'class="search-message' in html_content:
3281 raise ExtractorError(
3282 '[youtube] No video results', expected=True)
3283
3284 new_videos = list(self._process_page(html_content))
3285 videos += new_videos
3286 if not new_videos or len(videos) > limit:
3287 break
3288 next_link = self._html_search_regex(
3289 r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
3290 html_content, 'next link', default=None)
3291 if next_link is None:
3292 break
3293 result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
3294
3295 if len(videos) > n:
3296 videos = videos[:n]
3297 return self.playlist_result(videos, query)
3298
3299
3300 class YoutubeSearchDateIE(YoutubeSearchIE):
3301 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
3302 _SEARCH_KEY = 'ytsearchdate'
3303 IE_DESC = 'YouTube.com searches, newest videos first'
3304 _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
3305
3306
3307 class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
3308 IE_DESC = 'YouTube.com search URLs'
3309 IE_NAME = 'youtube:search_url'
3310 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
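# Grabs the ytInitialData JSON blob embedded in the search results page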
3311 _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
3312 _TESTS = [{
3313 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
3314 'playlist_mincount': 5,
3315 'info_dict': {
3316 'title': 'youtube-dl test video',
3317 }
3318 }, {
3319 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
3320 'only_matching': True,
3321 }]
3322
3323 def _find_videos_in_json(self, extracted):
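# Recursively walk the parsed ytInitialData and collect every dict carrying a 'videoId' key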
3324 videos = []
3325
3326 def _real_find(obj):
3327 if obj is None or isinstance(obj, compat_str):
3328 return
3329
3330 if type(obj) is list:
3331 for elem in obj:
3332 _real_find(elem)
3333
3334 if type(obj) is dict:
3335 if "videoId" in obj:
3336 videos.append(obj)
3337 return
3338
3339 for _, o in obj.items():
3340 _real_find(o)
3341
3342 _real_find(extracted)
3343
3344 return videos
3345
3346 def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
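# Deduplicates by video id; if an id was already seen without a title,
# a later occurrence that has one fills it in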
3347 search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)
3348
3349 result_items = self._find_videos_in_json(search_response)
3350
3351 for renderer in result_items:
3352 video_id = try_get(renderer, lambda x: x['videoId'])
3353 video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText'])
3354
3355 if video_id is None or video_title is None:
3356 # not a videoRenderer, or the title extraction broke
3357 continue
3358
3359 video_title = video_title.strip()
3360
3361 try:
3362 idx = ids_in_page.index(video_id)
3363 if video_title and not titles_in_page[idx]:
3364 titles_in_page[idx] = video_title
3365 except ValueError:
3366 ids_in_page.append(video_id)
3367 titles_in_page.append(video_title)
3368
3369 def extract_videos_from_page(self, page):
3370 ids_in_page = []
3371 titles_in_page = []
3372 self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
3373 return zip(ids_in_page, titles_in_page)
3374
3375 def _real_extract(self, url):
3376 mobj = re.match(self._VALID_URL, url)
3377 query = compat_urllib_parse_unquote_plus(mobj.group('query'))
3378 webpage = self._download_webpage(url, query)
3379 return self.playlist_result(self._process_page(webpage), playlist_title=query)
3380
3381
3382 class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
3383 IE_DESC = 'YouTube.com (multi-season) shows'
3384 _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
3385 IE_NAME = 'youtube:show'
3386 _TESTS = [{
3387 'url': 'https://www.youtube.com/show/airdisasters',
3388 'playlist_mincount': 5,
3389 'info_dict': {
3390 'id': 'airdisasters',
3391 'title': 'Air Disasters',
3392 }
3393 }]
3394
3395 def _real_extract(self, url):
3396 playlist_id = self._match_id(url)
3397 return super(YoutubeShowIE, self)._real_extract(
3398 'https://www.youtube.com/show/%s/playlists' % playlist_id)
3399
3400
3401 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
3402 """
3403 Base class for feed extractors
3404 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
3405 """
3406 _LOGIN_REQUIRED = True
3407 _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
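# The ytcfg.set(...) payload carries the client name/version, identity token and
# related values that are sent back as X-YouTube-* headers on continuation requests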
3408 _YTCFG_DATA = r"ytcfg\.set\(({.*?})\)"
3409
3410 @property
3411 def IE_NAME(self):
3412 return 'youtube:%s' % self._FEED_NAME
3413
3414 def _real_initialize(self):
3415 self._login()
3416
3417 def _find_videos_in_json(self, extracted):
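# Same recursive walk as in the search extractor, but also captures the
# nextContinuationData object used for paging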
3418 videos = []
3419 c = {}
3420
3421 def _real_find(obj):
3422 if obj is None or isinstance(obj, compat_str):
3423 return
3424
3425 if type(obj) is list:
3426 for elem in obj:
3427 _real_find(elem)
3428
3429 if type(obj) is dict:
3430 if "videoId" in obj:
3431 videos.append(obj)
3432 return
3433
3434 if "nextContinuationData" in obj:
3435 c["continuation"] = obj["nextContinuationData"]
3436 return
3437
3438 for _, o in obj.items():
3439 _real_find(o)
3440
3441 _real_find(extracted)
3442
3443 return videos, try_get(c, lambda x: x["continuation"])
3444
3445 def _entries(self, page):
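# Parse the initial ytInitialData, then keep following continuation tokens via the
# browse_ajax endpoint, skipping video ids that have already been yielded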
3446 info = []
3447
3448 yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)
3449
3450 search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)
3451
3452 for page_num in itertools.count(1):
3453 video_info, continuation = self._find_videos_in_json(search_response)
3454
3455 new_info = []
3456
3457 for v in video_info:
3458 v_id = try_get(v, lambda x: x['videoId'])
3459 if not v_id:
3460 continue
3461
3462 have_video = False
3463 for old in info:
3464 if old['videoId'] == v_id:
3465 have_video = True
3466 break
3467
3468 if not have_video:
3469 new_info.append(v)
3470
3471 if not new_info:
3472 break
3473
3474 info.extend(new_info)
3475
3476 for video in new_info:
3477 yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText']))
3478
3479 if not continuation or not yt_conf:
3480 break
3481
3482 search_response = self._download_json(
3483 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
3484 'Downloading page #%s' % page_num,
3485 transform_source=uppercase_escape,
3486 query={
3487 "ctoken": try_get(continuation, lambda x: x["continuation"]),
3488 "continuation": try_get(continuation, lambda x: x["continuation"]),
3489 "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
3490 },
3491 headers={
3492 "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
3493 "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
3494 "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
3495 "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
3496 "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
3497 "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
3498 "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
3499 })
3500
3501 def _real_extract(self, url):
3502 page = self._download_webpage(
3503 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
3504 self._PLAYLIST_TITLE)
3505 return self.playlist_result(
3506 self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3507
3508
3509 class YoutubeWatchLaterIE(YoutubePlaylistIE):
3510 IE_NAME = 'youtube:watchlater'
3511 IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
3512 _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
3513
3514 _TESTS = [{
3515 'url': 'https://www.youtube.com/playlist?list=WL',
3516 'only_matching': True,
3517 }, {
3518 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
3519 'only_matching': True,
3520 }]
3521
3522 def _real_extract(self, url):
3523 _, video = self._check_download_just_video(url, 'WL')
3524 if video:
3525 return video
3526 _, playlist = self._extract_playlist('WL')
3527 return playlist
3528
3529
3530 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
3531 IE_NAME = 'youtube:favorites'
3532 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
3533 _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
3534 _LOGIN_REQUIRED = True
3535
3536 def _real_extract(self, url):
3537 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
3538 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
3539 return self.url_result(playlist_id, 'YoutubePlaylist')
3540
3541
3542 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
3543 IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
3544 _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
3545 _FEED_NAME = 'recommended'
3546 _PLAYLIST_TITLE = 'Youtube Recommended videos'
3547
3548
3549 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
3550 IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
3551 _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
3552 _FEED_NAME = 'subscriptions'
3553 _PLAYLIST_TITLE = 'Youtube Subscriptions'
3554
3555
3556 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
3557 IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
3558 _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
3559 _FEED_NAME = 'history'
3560 _PLAYLIST_TITLE = 'Youtube History'
3561
3562
3563 class YoutubeTruncatedURLIE(InfoExtractor):
3564 IE_NAME = 'youtube:truncated_url'
3565 IE_DESC = False # Do not list
3566 _VALID_URL = r'''(?x)
3567 (?:https?://)?
3568 (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
3569 (?:watch\?(?:
3570 feature=[a-z_]+|
3571 annotation_id=annotation_[^&]+|
3572 x-yt-cl=[0-9]+|
3573 hl=[^&]*|
3574 t=[0-9]+
3575 )?
3576 |
3577 attribution_link\?a=[^&]+
3578 )
3579 $
3580 '''
3581
3582 _TESTS = [{
3583 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
3584 'only_matching': True,
3585 }, {
3586 'url': 'https://www.youtube.com/watch?',
3587 'only_matching': True,
3588 }, {
3589 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
3590 'only_matching': True,
3591 }, {
3592 'url': 'https://www.youtube.com/watch?feature=foo',
3593 'only_matching': True,
3594 }, {
3595 'url': 'https://www.youtube.com/watch?hl=en-GB',
3596 'only_matching': True,
3597 }, {
3598 'url': 'https://www.youtube.com/watch?t=2372',
3599 'only_matching': True,
3600 }]
3601
3602 def _real_extract(self, url):
3603 raise ExtractorError(
3604 'Did you forget to quote the URL? Remember that & is a meta '
3605 'character in most shells, so you want to put the URL in quotes, '
3606 'like youtube-dl '
3607 '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
3608 'or simply youtube-dl BaW_jenozKc.',
3609 expected=True)
3610
3611
3612 class YoutubeTruncatedIDIE(InfoExtractor):
3613 IE_NAME = 'youtube:truncated_id'
3614 IE_DESC = False # Do not list
3615 _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
3616
3617 _TESTS = [{
3618 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
3619 'only_matching': True,
3620 }]
3621
3622 def _real_extract(self, url):
3623 video_id = self._match_id(url)
3624 raise ExtractorError(
3625 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
3626 expected=True)